kernel_arpi/mm/mempolicy.c
Greg Kroah-Hartman 47c7e57022 Merge 5.15.61 into android14-5.15
Changes in 5.15.61
        Makefile: link with -z noexecstack --no-warn-rwx-segments
        x86: link vdso and boot with -z noexecstack --no-warn-rwx-segments
        Revert "pNFS: nfs3_set_ds_client should set NFS_CS_NOPING"
        scsi: Revert "scsi: qla2xxx: Fix disk failure to rediscover"
        pNFS/flexfiles: Report RDMA connection errors to the server
        NFSD: Clean up the show_nf_flags() macro
        nfsd: eliminate the NFSD_FILE_BREAK_* flags
        ALSA: usb-audio: Add quirk for Behringer UMC202HD
        ALSA: bcd2000: Fix a UAF bug on the error path of probing
        ALSA: hda/realtek: Add quirk for Clevo NV45PZ
        ALSA: hda/realtek: Add quirk for HP Spectre x360 15-eb0xxx
        wifi: mac80211_hwsim: fix race condition in pending packet
        wifi: mac80211_hwsim: add back erroneously removed cast
        wifi: mac80211_hwsim: use 32-bit skb cookie
        add barriers to buffer_uptodate and set_buffer_uptodate
        lockd: detect and reject lock arguments that overflow
        HID: hid-input: add Surface Go battery quirk
        HID: wacom: Only report rotation for art pen
        HID: wacom: Don't register pad_input for touch switch
        KVM: nVMX: Snapshot pre-VM-Enter BNDCFGS for !nested_run_pending case
        KVM: nVMX: Snapshot pre-VM-Enter DEBUGCTL for !nested_run_pending case
        KVM: SVM: Don't BUG if userspace injects an interrupt with GIF=0
        KVM: s390: pv: don't present the ecall interrupt twice
        KVM: x86: Split kvm_is_valid_cr4() and export only the non-vendor bits
        KVM: nVMX: Let userspace set nVMX MSR to any _host_ supported value
        KVM: nVMX: Account for KVM reserved CR4 bits in consistency checks
        KVM: nVMX: Inject #UD if VMXON is attempted with incompatible CR0/CR4
        KVM: x86: Mark TSS busy during LTR emulation _after_ all fault checks
        KVM: x86: Set error code to segment selector on LLDT/LTR non-canonical #GP
        KVM: nVMX: Always enable TSC scaling for L2 when it was enabled for L1
        KVM: x86: Tag kvm_mmu_x86_module_init() with __init
        KVM: x86: do not report preemption if the steal time cache is stale
        KVM: x86: revalidate steal time cache if MSR value changes
        riscv: set default pm_power_off to NULL
        ALSA: hda/conexant: Add quirk for LENOVO 20149 Notebook model
        ALSA: hda/cirrus - support for iMac 12,1 model
        ALSA: hda/realtek: Add quirk for another Asus K42JZ model
        ALSA: hda/realtek: Add a quirk for HP OMEN 15 (8786) mute LED
        tty: vt: initialize unicode screen buffer
        vfs: Check the truncate maximum size in inode_newsize_ok()
        fs: Add missing umask strip in vfs_tmpfile
        thermal: sysfs: Fix cooling_device_stats_setup() error code path
        fbcon: Fix boundary checks for fbcon=vc:n1-n2 parameters
        fbcon: Fix accelerated fbdev scrolling while logo is still shown
        usbnet: Fix linkwatch use-after-free on disconnect
        fix short copy handling in copy_mc_pipe_to_iter()
        crypto: ccp - Use kzalloc for sev ioctl interfaces to prevent kernel memory leak
        ovl: drop WARN_ON() dentry is NULL in ovl_encode_fh()
        parisc: Fix device names in /proc/iomem
        parisc: Drop pa_swapper_pg_lock spinlock
        parisc: Check the return value of ioremap() in lba_driver_probe()
        parisc: io_pgetevents_time64() needs compat syscall in 32-bit compat mode
        riscv:uprobe fix SR_SPIE set/clear handling
        dt-bindings: riscv: fix SiFive l2-cache's cache-sets
        RISC-V: kexec: Fixup use of smp_processor_id() in preemptible context
        RISC-V: Fixup get incorrect user mode PC for kernel mode regs
        RISC-V: Fixup schedule out issue in machine_crash_shutdown()
        RISC-V: Add modules to virtual kernel memory layout dump
        rtc: rx8025: fix 12/24 hour mode detection on RX-8035
        drm/gem: Properly annotate WW context on drm_gem_lock_reservations() error
        drm/shmem-helper: Add missing vunmap on error
        drm/vc4: hdmi: Disable audio if dmas property is present but empty
        drm/hyperv-drm: Include framebuffer and EDID headers
        drm/nouveau: fix another off-by-one in nvbios_addr
        drm/nouveau: Don't pm_runtime_put_sync(), only pm_runtime_put_autosuspend()
        drm/nouveau/acpi: Don't print error when we get -EINPROGRESS from pm_runtime
        drm/nouveau/kms: Fix failure path for creating DP connectors
        drm/amdgpu: Check BO's requested pinning domains against its preferred_domains
        drm/amdgpu: fix check in fbdev init
        bpf: Fix KASAN use-after-free Read in compute_effective_progs
        btrfs: reject log replay if there is unsupported RO compat flag
        mtd: rawnand: arasan: Fix clock rate in NV-DDR
        mtd: rawnand: arasan: Update NAND bus clock instead of system clock
        um: Remove straying parenthesis
        um: seed rng using host OS rng
        iio: fix iio_format_avail_range() printing for none IIO_VAL_INT
        iio: light: isl29028: Fix the warning in isl29028_remove()
        scsi: sg: Allow waiting for commands to complete on removed device
        scsi: qla2xxx: Fix incorrect display of max frame size
        scsi: qla2xxx: Zero undefined mailbox IN registers
        soundwire: qcom: Check device status before reading devid
        ksmbd: fix memory leak in smb2_handle_negotiate
        ksmbd: prevent out of bound read for SMB2_TREE_CONNNECT
        ksmbd: fix use-after-free bug in smb2_tree_disconect
        fuse: limit nsec
        fuse: ioctl: translate ENOSYS
        serial: mvebu-uart: uart2 error bits clearing
        md-raid: destroy the bitmap after destroying the thread
        md-raid10: fix KASAN warning
        mbcache: don't reclaim used entries
        mbcache: add functions to delete entry if unused
        media: [PATCH] pci: atomisp_cmd: fix three missing checks on list iterator
        ia64, processor: fix -Wincompatible-pointer-types in ia64_get_irr()
        PCI: Add defines for normal and subtractive PCI bridges
        powerpc/fsl-pci: Fix Class Code of PCIe Root Port
        powerpc/ptdump: Fix display of RW pages on FSL_BOOK3E
        powerpc/powernv: Avoid crashing if rng is NULL
        MIPS: cpuinfo: Fix a warning for CONFIG_CPUMASK_OFFSTACK
        coresight: Clear the connection field properly
        usb: typec: ucsi: Acknowledge the GET_ERROR_STATUS command completion
        USB: HCD: Fix URB giveback issue in tasklet function
        ARM: dts: uniphier: Fix USB interrupts for PXs2 SoC
        arm64: dts: uniphier: Fix USB interrupts for PXs3 SoC
        usb: dwc3: gadget: refactor dwc3_repare_one_trb
        usb: dwc3: gadget: fix high speed multiplier setting
        netfilter: nf_tables: do not allow SET_ID to refer to another table
        netfilter: nf_tables: do not allow CHAIN_ID to refer to another table
        netfilter: nf_tables: do not allow RULE_ID to refer to another chain
        netfilter: nf_tables: fix null deref due to zeroed list head
        epoll: autoremove wakers even more aggressively
        x86: Handle idle=nomwait cmdline properly for x86_idle
        arch: make TRACE_IRQFLAGS_NMI_SUPPORT generic
        arm64: Do not forget syscall when starting a new thread.
        arm64: fix oops in concurrently setting insn_emulation sysctls
        arm64: kasan: Revert "arm64: mte: reset the page tag in page->flags"
        ext2: Add more validity checks for inode counts
        sched/fair: Introduce SIS_UTIL to search idle CPU based on sum of util_avg
        genirq: Don't return error on missing optional irq_request_resources()
        irqchip/mips-gic: Only register IPI domain when SMP is enabled
        genirq: GENERIC_IRQ_IPI depends on SMP
        sched/core: Always flush pending blk_plug
        irqchip/mips-gic: Check the return value of ioremap() in gic_of_init()
        wait: Fix __wait_event_hrtimeout for RT/DL tasks
        ARM: dts: imx6ul: add missing properties for sram
        ARM: dts: imx6ul: change operating-points to uint32-matrix
        ARM: dts: imx6ul: fix keypad compatible
        ARM: dts: imx6ul: fix csi node compatible
        ARM: dts: imx6ul: fix lcdif node compatible
        ARM: dts: imx6ul: fix qspi node compatible
        ARM: dts: BCM5301X: Add DT for Meraki MR26
        ARM: dts: ux500: Fix Codina accelerometer mounting matrix
        ARM: dts: ux500: Fix Gavini accelerometer mounting matrix
        spi: synquacer: Add missing clk_disable_unprepare()
        ARM: OMAP2+: display: Fix refcount leak bug
        ARM: OMAP2+: pdata-quirks: Fix refcount leak bug
        ACPI: EC: Remove duplicate ThinkPad X1 Carbon 6th entry from DMI quirks
        ACPI: EC: Drop the EC_FLAGS_IGNORE_DSDT_GPE quirk
        ACPI: PM: save NVS memory for Lenovo G40-45
        ACPI: LPSS: Fix missing check in register_device_clock()
        ARM: dts: qcom: sdx55: Fix the IRQ trigger type for UART
        arm64: dts: qcom: ipq8074: fix NAND node name
        arm64: dts: allwinner: a64: orangepi-win: Fix LED node name
        ARM: shmobile: rcar-gen2: Increase refcount for new reference
        firmware: tegra: Fix error check return value of debugfs_create_file()
        hwmon: (dell-smm) Add Dell XPS 13 7390 to fan control whitelist
        hwmon: (sht15) Fix wrong assumptions in device remove callback
        PM: hibernate: defer device probing when resuming from hibernation
        selinux: fix memleak in security_read_state_kernel()
        selinux: Add boundary check in put_entry()
        kasan: test: Silence GCC 12 warnings
        drm/amdgpu: Remove one duplicated ef removal
        powerpc/64s: Disable stack variable initialisation for prom_init
        spi: spi-rspi: Fix PIO fallback on RZ platforms
        ARM: findbit: fix overflowing offset
        meson-mx-socinfo: Fix refcount leak in meson_mx_socinfo_init
        arm64: dts: renesas: beacon: Fix regulator node names
        spi: spi-altera-dfl: Fix an error handling path
        ARM: bcm: Fix refcount leak in bcm_kona_smc_init
        ACPI: processor/idle: Annotate more functions to live in cpuidle section
        ARM: dts: imx7d-colibri-emmc: add cpu1 supply
        soc: renesas: r8a779a0-sysc: Fix A2DP1 and A2CV[2357] PDR values
        scsi: hisi_sas: Use managed PCI functions
        dt-bindings: iio: accel: Add DT binding doc for ADXL355
        soc: amlogic: Fix refcount leak in meson-secure-pwrc.c
        arm64: dts: renesas: Fix thermal-sensors on single-zone sensors
        x86/pmem: Fix platform-device leak in error path
        ARM: dts: ast2500-evb: fix board compatible
        ARM: dts: ast2600-evb: fix board compatible
        ARM: dts: ast2600-evb-a1: fix board compatible
        arm64: dts: mt8192: Fix idle-states nodes naming scheme
        arm64: dts: mt8192: Fix idle-states entry-method
        arm64: select TRACE_IRQFLAGS_NMI_SUPPORT
        arm64: cpufeature: Allow different PMU versions in ID_DFR0_EL1
        locking/lockdep: Fix lockdep_init_map_*() confusion
        arm64: dts: qcom: sc7180: Remove ipa_fw_mem node on trogdor
        soc: fsl: guts: machine variable might be unset
        block: fix infinite loop for invalid zone append
        ARM: dts: qcom: mdm9615: add missing PMIC GPIO reg
        ARM: OMAP2+: Fix refcount leak in omapdss_init_of
        ARM: OMAP2+: Fix refcount leak in omap3xxx_prm_late_init
        arm64: dts: qcom: sdm630: disable GPU by default
        arm64: dts: qcom: sdm630: fix the qusb2phy ref clock
        arm64: dts: qcom: sdm630: fix gpu's interconnect path
        arm64: dts: qcom: sdm636-sony-xperia-ganges-mermaid: correct sdc2 pinconf
        cpufreq: zynq: Fix refcount leak in zynq_get_revision
        regulator: qcom_smd: Fix pm8916_pldo range
        ACPI: APEI: Fix _EINJ vs EFI_MEMORY_SP
        ARM: dts: qcom-msm8974: fix irq type on blsp2_uart1
        soc: qcom: ocmem: Fix refcount leak in of_get_ocmem
        soc: qcom: aoss: Fix refcount leak in qmp_cooling_devices_register
        ARM: dts: qcom: pm8841: add required thermal-sensor-cells
        bus: hisi_lpc: fix missing platform_device_put() in hisi_lpc_acpi_probe()
        stack: Declare {randomize_,}kstack_offset to fix Sparse warnings
        arm64: dts: qcom: msm8916: Fix typo in pronto remoteproc node
        ACPI: APEI: explicit init of HEST and GHES in apci_init()
        drivers/iio: Remove all strcpy() uses
        ACPI: VIOT: Fix ACS setup
        arm64: dts: qcom: sm6125: Move sdc2 pinctrl from seine-pdx201 to sm6125
        arm64: dts: qcom: sm6125: Append -state suffix to pinctrl nodes
        arm64: dts: qcom: sm8250: add missing PCIe PHY clock-cells
        arm64: dts: mt7622: fix BPI-R64 WPS button
        arm64: tegra: Fixup SYSRAM references
        arm64: tegra: Update Tegra234 BPMP channel addresses
        arm64: tegra: Mark BPMP channels as no-memory-wc
        arm64: tegra: Fix SDMMC1 CD on P2888
        erofs: avoid consecutive detection for Highmem memory
        blk-mq: don't create hctx debugfs dir until q->debugfs_dir is created
        spi: Fix simplification of devm_spi_register_controller
        spi: tegra20-slink: fix UAF in tegra_slink_remove()
        hwmon: (drivetemp) Add module alias
        blktrace: Trace remapped requests correctly
        PM: domains: Ensure genpd_debugfs_dir exists before remove
        dm writecache: return void from functions
        dm writecache: count number of blocks read, not number of read bios
        dm writecache: count number of blocks written, not number of write bios
        dm writecache: count number of blocks discarded, not number of discard bios
        regulator: of: Fix refcount leak bug in of_get_regulation_constraints()
        soc: qcom: Make QCOM_RPMPD depend on PM
        arm64: dts: qcom: qcs404: Fix incorrect USB2 PHYs assignment
        irqdomain: Report irq number for NOMAP domains
        drivers/perf: arm_spe: Fix consistency of SYS_PMSCR_EL1.CX
        nohz/full, sched/rt: Fix missed tick-reenabling bug in dequeue_task_rt()
        x86/extable: Fix ex_handler_msr() print condition
        selftests/seccomp: Fix compile warning when CC=clang
        thermal/tools/tmon: Include pthread and time headers in tmon.h
        dm: return early from dm_pr_call() if DM device is suspended
        pwm: sifive: Simplify offset calculation for PWMCMP registers
        pwm: sifive: Ensure the clk is enabled exactly once per running PWM
        pwm: sifive: Shut down hardware only after pwmchip_remove() completed
        pwm: lpc18xx-sct: Reduce number of devm memory allocations
        pwm: lpc18xx-sct: Simplify driver by not using pwm_[gs]et_chip_data()
        pwm: lpc18xx: Fix period handling
        drm/dp: Export symbol / kerneldoc fixes for DP AUX bus
        drm/bridge: tc358767: Move (e)DP bridge endpoint parsing into dedicated function
        ath10k: do not enforce interrupt trigger type
        drm/st7735r: Fix module autoloading for Okaya RH128128T
        drm/panel: Fix build error when CONFIG_DRM_PANEL_SAMSUNG_ATNA33XC20=y && CONFIG_DRM_DISPLAY_HELPER=m
        wifi: rtlwifi: fix error codes in rtl_debugfs_set_write_h2c()
        ath11k: fix netdev open race
        drm/mipi-dbi: align max_chunk to 2 in spi_transfer
        ath11k: Fix incorrect debug_mask mappings
        drm/radeon: fix potential buffer overflow in ni_set_mc_special_registers()
        drm/mediatek: Modify dsi funcs to atomic operations
        drm/mediatek: Separate poweron/poweroff from enable/disable and define new funcs
        drm/mediatek: Add pull-down MIPI operation in mtk_dsi_poweroff function
        drm/meson: encoder_hdmi: switch to bridge DRM_BRIDGE_ATTACH_NO_CONNECTOR
        drm/meson: encoder_hdmi: Fix refcount leak in meson_encoder_hdmi_init
        drm/bridge: lt9611uxc: Cancel only driver's work
        i2c: npcm: Remove own slave addresses 2:10
        i2c: npcm: Correct slave role behavior
        i2c: mxs: Silence a clang warning
        virtio-gpu: fix a missing check to avoid NULL dereference
        drm/shmem-helper: Unexport drm_gem_shmem_create_with_handle()
        drm/shmem-helper: Export dedicated wrappers for GEM object functions
        drm/shmem-helper: Pass GEM shmem object in public interfaces
        drm/virtio: Fix NULL vs IS_ERR checking in virtio_gpu_object_shmem_init
        drm: adv7511: override i2c address of cec before accessing it
        crypto: sun8i-ss - do not allocate memory when handling hash requests
        crypto: sun8i-ss - fix error codes in allocate_flows()
        net: fix sk_wmem_schedule() and sk_rmem_schedule() errors
        can: netlink: allow configuring of fixed bit rates without need for do_set_bittiming callback
        can: netlink: allow configuring of fixed data bit rates without need for do_set_data_bittiming callback
        i2c: Fix a potential use after free
        crypto: sun8i-ss - fix infinite loop in sun8i_ss_setup_ivs()
        media: atmel: atmel-sama7g5-isc: fix warning in configs without OF
        media: tw686x: Register the irq at the end of probe
        media: imx-jpeg: Correct some definition according specification
        media: imx-jpeg: Leave a blank space before the configuration data
        media: imx-jpeg: Add pm-runtime support for imx-jpeg
        media: imx-jpeg: use NV12M to represent non contiguous NV12
        media: imx-jpeg: Set V4L2_BUF_FLAG_LAST at eos
        media: imx-jpeg: Refactor function mxc_jpeg_parse
        media: imx-jpeg: Identify and handle precision correctly
        media: imx-jpeg: Handle source change in a function
        media: imx-jpeg: Support dynamic resolution change
        media: imx-jpeg: Align upwards buffer size
        media: imx-jpeg: Implement drain using v4l2-mem2mem helpers
        ath9k: fix use-after-free in ath9k_hif_usb_rx_cb
        wifi: iwlegacy: 4965: fix potential off-by-one overflow in il4965_rs_fill_link_cmd()
        drm/radeon: fix incorrrect SPDX-License-Identifiers
        rcutorture: Warn on individual rcu_torture_init() error conditions
        rcutorture: Don't cpuhp_remove_state() if cpuhp_setup_state() failed
        rcutorture: Fix ksoftirqd boosting timing and iteration
        test_bpf: fix incorrect netdev features
        crypto: ccp - During shutdown, check SEV data pointer before using
        drm: bridge: adv7511: Add check for mipi_dsi_driver_register
        media: imx-jpeg: Disable slot interrupt when frame done
        drm/mcde: Fix refcount leak in mcde_dsi_bind
        media: hdpvr: fix error value returns in hdpvr_read
        media: v4l2-mem2mem: prevent pollerr when last_buffer_dequeued is set
        media: driver/nxp/imx-jpeg: fix a unexpected return value problem
        media: tw686x: Fix memory leak in tw686x_video_init
        drm/vc4: plane: Remove subpixel positioning check
        drm/vc4: plane: Fix margin calculations for the right/bottom edges
        drm/bridge: Add a function to abstract away panels
        drm/vc4: dsi: Switch to devm_drm_of_get_bridge
        drm/vc4: Use of_device_get_match_data()
        drm/vc4: dsi: Release workaround buffer and DMA
        drm/vc4: dsi: Correct DSI divider calculations
        drm/vc4: dsi: Correct pixel order for DSI0
        drm/vc4: dsi: Register dsi0 as the correct vc4 encoder type
        drm/vc4: dsi: Fix dsi0 interrupt support
        drm/vc4: dsi: Add correct stop condition to vc4_dsi_encoder_disable iteration
        drm/vc4: hdmi: Fix HPD GPIO detection
        drm/vc4: hdmi: Avoid full hdmi audio fifo writes
        drm/vc4: hdmi: Reset HDMI MISC_CONTROL register
        drm/vc4: hdmi: Fix timings for interlaced modes
        drm/vc4: hdmi: Correct HDMI timing registers for interlaced modes
        crypto: arm64/gcm - Select AEAD for GHASH_ARM64_CE
        selftests/xsk: Destroy BPF resources only when ctx refcount drops to 0
        drm/rockchip: vop: Don't crash for invalid duplicate_state()
        drm/rockchip: Fix an error handling path rockchip_dp_probe()
        drm/mediatek: dpi: Remove output format of YUV
        drm/mediatek: dpi: Only enable dpi after the bridge is enabled
        drm: bridge: sii8620: fix possible off-by-one
        hinic: Use the bitmap API when applicable
        net: hinic: fix bug that ethtool get wrong stats
        net: hinic: avoid kernel hung in hinic_get_stats64()
        drm/msm/mdp5: Fix global state lock backoff
        crypto: hisilicon/sec - don't sleep when in softirq
        crypto: hisilicon - Kunpeng916 crypto driver don't sleep when in softirq
        media: platform: mtk-mdp: Fix mdp_ipi_comm structure alignment
        drm/msm: Avoid dirtyfb stalls on video mode displays (v2)
        drm/msm/dpu: Fix for non-visible planes
        mt76: mt76x02u: fix possible memory leak in __mt76x02u_mcu_send_msg
        mt76: mt7615: do not update pm stats in case of error
        ieee80211: add EHT 1K aggregation definitions
        mt76: mt7921: fix aggregation subframes setting to HE max
        mt76: mt7921: enlarge maximum VHT MPDU length to 11454
        mediatek: mt76: mac80211: Fix missing of_node_put() in mt76_led_init()
        mediatek: mt76: eeprom: fix missing of_node_put() in mt76_find_power_limits_node()
        skmsg: Fix invalid last sg check in sk_msg_recvmsg()
        drm/exynos/exynos7_drm_decon: free resources when clk_set_parent() failed.
        tcp: make retransmitted SKB fit into the send window
        libbpf: Fix the name of a reused map
        selftests: timers: valid-adjtimex: build fix for newer toolchains
        selftests: timers: clocksource-switch: fix passing errors from child
        bpf: Fix subprog names in stack traces.
        fs: check FMODE_LSEEK to control internal pipe splicing
        media: cedrus: h265: Fix flag name
        media: hantro: postproc: Fix motion vector space size
        media: hantro: Simplify postprocessor
        media: hevc: Embedded indexes in RPS
        media: staging: media: hantro: Fix typos
        wifi: wil6210: debugfs: fix info leak in wil_write_file_wmi()
        wifi: p54: Fix an error handling path in p54spi_probe()
        wifi: p54: add missing parentheses in p54_flush()
        selftests/bpf: fix a test for snprintf() overflow
        libbpf: fix an snprintf() overflow check
        can: pch_can: do not report txerr and rxerr during bus-off
        can: rcar_can: do not report txerr and rxerr during bus-off
        can: sja1000: do not report txerr and rxerr during bus-off
        can: hi311x: do not report txerr and rxerr during bus-off
        can: sun4i_can: do not report txerr and rxerr during bus-off
        can: kvaser_usb_hydra: do not report txerr and rxerr during bus-off
        can: kvaser_usb_leaf: do not report txerr and rxerr during bus-off
        can: usb_8dev: do not report txerr and rxerr during bus-off
        can: error: specify the values of data[5..7] of CAN error frames
        can: pch_can: pch_can_error(): initialize errc before using it
        Bluetooth: hci_intel: Add check for platform_driver_register
        i2c: cadence: Support PEC for SMBus block read
        i2c: mux-gpmux: Add of_node_put() when breaking out of loop
        wifi: wil6210: debugfs: fix uninitialized variable use in `wil_write_file_wmi()`
        wifi: iwlwifi: mvm: fix double list_add at iwl_mvm_mac_wake_tx_queue
        wifi: libertas: Fix possible refcount leak in if_usb_probe()
        media: cedrus: hevc: Add check for invalid timestamp
        net/mlx5e: Remove WARN_ON when trying to offload an unsupported TLS cipher/version
        net/mlx5e: Fix the value of MLX5E_MAX_RQ_NUM_MTTS
        net/mlx5: Adjust log_max_qp to be 18 at most
        crypto: hisilicon/hpre - don't use GFP_KERNEL to alloc mem during softirq
        crypto: inside-secure - Add missing MODULE_DEVICE_TABLE for of
        crypto: hisilicon/sec - fix auth key size error
        inet: add READ_ONCE(sk->sk_bound_dev_if) in INET_MATCH()
        ipv6: add READ_ONCE(sk->sk_bound_dev_if) in INET6_MATCH()
        net: allow unbound socket for packets in VRF when tcp_l3mdev_accept set
        netdevsim: fib: Fix reference count leak on route deletion failure
        wifi: rtw88: check the return value of alloc_workqueue()
        iavf: Fix max_rate limiting
        iavf: Fix 'tc qdisc show' listing too many queues
        netdevsim: Avoid allocation warnings triggered from user space
        net: rose: fix netdev reference changes
        net: ionic: fix error check for vlan flags in ionic_set_nic_features()
        dccp: put dccp_qpolicy_full() and dccp_qpolicy_push() in the same lock
        net: usb: make USB_RTL8153_ECM non user configurable
        wireguard: ratelimiter: use hrtimer in selftest
        wireguard: allowedips: don't corrupt stack when detecting overflow
        HID: amd_sfh: Don't show client init failed as error when discovery fails
        clk: renesas: r9a06g032: Fix UART clkgrp bitsel
        mtd: maps: Fix refcount leak in of_flash_probe_versatile
        mtd: maps: Fix refcount leak in ap_flash_init
        mtd: rawnand: meson: Fix a potential double free issue
        of: check previous kernel's ima-kexec-buffer against memory bounds
        scsi: qla2xxx: edif: Reduce Initiator-Initiator thrashing
        scsi: qla2xxx: edif: Fix potential stuck session in sa update
        scsi: qla2xxx: edif: Reduce connection thrash
        scsi: qla2xxx: edif: Fix inconsistent check of db_flags
        scsi: qla2xxx: edif: Synchronize NPIV deletion with authentication application
        scsi: qla2xxx: edif: Add retry for ELS passthrough
        scsi: qla2xxx: edif: Fix n2n discovery issue with secure target
        scsi: qla2xxx: edif: Fix n2n login retry for secure device
        KVM: SVM: Unwind "speculative" RIP advancement if INTn injection "fails"
        KVM: SVM: Stuff next_rip on emulated INT3 injection if NRIPS is supported
        phy: samsung: exynosautov9-ufs: correct TSRV register configurations
        PCI: microchip: Fix refcount leak in mc_pcie_init_irq_domains()
        PCI: tegra194: Fix PM error handling in tegra_pcie_config_ep()
        HID: cp2112: prevent a buffer overflow in cp2112_xfer()
        mtd: sm_ftl: Fix deadlock caused by cancel_work_sync in sm_release
        mtd: partitions: Fix refcount leak in parse_redboot_of
        mtd: parsers: ofpart: Fix refcount leak in bcm4908_partitions_fw_offset
        mtd: st_spi_fsm: Add a clk_disable_unprepare() in .probe()'s error path
        PCI: mediatek-gen3: Fix refcount leak in mtk_pcie_init_irq_domains()
        fpga: altera-pr-ip: fix unsigned comparison with less than zero
        usb: host: Fix refcount leak in ehci_hcd_ppc_of_probe
        usb: ohci-nxp: Fix refcount leak in ohci_hcd_nxp_probe
        usb: gadget: tegra-xudc: Fix error check in tegra_xudc_powerdomain_init()
        usb: xhci: tegra: Fix error check
        netfilter: xtables: Bring SPDX identifier back
        scsi: qla2xxx: edif: Send LOGO for unexpected IKE message
        scsi: qla2xxx: edif: Reduce disruption due to multiple app start
        scsi: qla2xxx: edif: Fix no login after app start
        scsi: qla2xxx: edif: Tear down session if keys have been removed
        scsi: qla2xxx: edif: Fix session thrash
        scsi: qla2xxx: edif: Fix no logout on delete for N2N
        iio: accel: bma400: Fix the scale min and max macro values
        platform/chrome: cros_ec: Always expose last resume result
        iio: accel: bma400: Reordering of header files
        clk: mediatek: reset: Fix written reset bit offset
        lib/test_hmm: avoid accessing uninitialized pages
        memremap: remove support for external pgmap refcounts
        mm/memremap: fix memunmap_pages() race with get_dev_pagemap()
        KVM: Don't set Accessed/Dirty bits for ZERO_PAGE
        mwifiex: Ignore BTCOEX events from the 88W8897 firmware
        mwifiex: fix sleep in atomic context bugs caused by dev_coredumpv
        scsi: iscsi: Allow iscsi_if_stop_conn() to be called from kernel
        scsi: iscsi: Add helper to remove a session from the kernel
        scsi: iscsi: Fix session removal on shutdown
        dmaengine: dw-edma: Fix eDMA Rd/Wr-channels and DMA-direction semantics
        mtd: dataflash: Add SPI ID table
        clk: qcom: camcc-sm8250: Fix halt on boot by reducing driver's init level
        misc: rtsx: Fix an error handling path in rtsx_pci_probe()
        driver core: fix potential deadlock in __driver_attach
        clk: qcom: clk-krait: unlock spin after mux completion
        clk: qcom: gcc-msm8939: Add missing SYSTEM_MM_NOC_BFDCD_CLK_SRC
        clk: qcom: gcc-msm8939: Fix bimc_ddr_clk_src rcgr base address
        clk: qcom: gcc-msm8939: Add missing system_mm_noc_bfdcd_clk_src
        clk: qcom: gcc-msm8939: Point MM peripherals to system_mm_noc clock
        usb: host: xhci: use snprintf() in xhci_decode_trb()
        RDMA/rxe: Fix deadlock in rxe_do_local_ops()
        clk: qcom: ipq8074: fix NSS core PLL-s
        clk: qcom: ipq8074: SW workaround for UBI32 PLL lock
        clk: qcom: ipq8074: fix NSS port frequency tables
        clk: qcom: ipq8074: set BRANCH_HALT_DELAY flag for UBI clocks
        clk: qcom: camcc-sdm845: Fix topology around titan_top power domain
        clk: qcom: camcc-sm8250: Fix topology around titan_top power domain
        clk: qcom: clk-rcg2: Fail Duty-Cycle configuration if MND divider is not enabled.
        clk: qcom: clk-rcg2: Make sure to not write d=0 to the NMD register
        mm/mempolicy: fix get_nodes out of bound access
        PCI: dwc: Stop link on host_init errors and de-initialization
        PCI: dwc: Add unroll iATU space support to dw_pcie_disable_atu()
        PCI: dwc: Disable outbound windows only for controllers using iATU
        PCI: dwc: Set INCREASE_REGION_SIZE flag based on limit address
        PCI: dwc: Deallocate EPC memory on dw_pcie_ep_init() errors
        PCI: dwc: Always enable CDM check if "snps,enable-cdm-check" exists
        soundwire: bus_type: fix remove and shutdown support
        soundwire: revisit driver bind/unbind and callbacks
        KVM: arm64: Don't return from void function
        dmaengine: sf-pdma: Add multithread support for a DMA channel
        PCI: endpoint: Don't stop controller when unbinding endpoint function
        scsi: qla2xxx: Check correct variable in qla24xx_async_gffid()
        intel_th: Fix a resource leak in an error handling path
        intel_th: msu-sink: Potential dereference of null pointer
        intel_th: msu: Fix vmalloced buffers
        binder: fix redefinition of seq_file attributes
        staging: rtl8192u: Fix sleep in atomic context bug in dm_fsync_timer_callback
        mmc: sdhci-of-esdhc: Fix refcount leak in esdhc_signal_voltage_switch
        mmc: mxcmmc: Silence a clang warning
        mmc: renesas_sdhi: Get the reset handle early in the probe
        memstick/ms_block: Fix some incorrect memory allocation
        memstick/ms_block: Fix a memory leak
        mmc: sdhci-of-at91: fix set_uhs_signaling rewriting of MC1R
        of: device: Fix missing of_node_put() in of_dma_set_restricted_buffer
        mmc: block: Add single read for 4k sector cards
        KVM: s390: pv: leak the topmost page table when destroy fails
        PCI/portdrv: Don't disable AER reporting in get_port_device_capability()
        PCI: qcom: Set up rev 2.1.0 PARF_PHY before enabling clocks
        scsi: smartpqi: Fix DMA direction for RAID requests
        xtensa: iss/network: provide release() callback
        xtensa: iss: fix handling error cases in iss_net_configure()
        usb: gadget: udc: amd5536 depends on HAS_DMA
        usb: aspeed-vhub: Fix refcount leak bug in ast_vhub_init_desc()
        usb: dwc3: core: Deprecate GCTL.CORESOFTRESET
        usb: dwc3: core: Do not perform GCTL_CORE_SOFTRESET during bootup
        usb: dwc3: qcom: fix missing optional irq warnings
        eeprom: idt_89hpesx: uninitialized data in idt_dbgfs_csr_write()
        phy: stm32: fix error return in stm32_usbphyc_phy_init
        interconnect: imx: fix max_node_id
        um: random: Don't initialise hwrng struct with zero
        RDMA/irdma: Fix a window for use-after-free
        RDMA/irdma: Fix VLAN connection with wildcard address
        RDMA/irdma: Fix setting of QP context err_rq_idx_valid field
        RDMA/rtrs-srv: Fix modinfo output for stringify
        RDMA/rtrs: Fix warning when use poll mode on client side.
        RDMA/rtrs: Replace duplicate check with is_pollqueue helper
        RDMA/rtrs: Introduce destroy_cq helper
        RDMA/rtrs: Do not allow sessname to contain special symbols / and .
        RDMA/rtrs: Rename rtrs_sess to rtrs_path
        RDMA/rtrs-srv: Rename rtrs_srv_sess to rtrs_srv_path
        RDMA/rtrs-clt: Rename rtrs_clt_sess to rtrs_clt_path
        RDMA/rtrs-clt: Replace list_next_or_null_rr_rcu with an inline function
        RDMA/qedr: Fix potential memory leak in __qedr_alloc_mr()
        RDMA/hns: Fix incorrect clearing of interrupt status register
        RDMA/siw: Fix duplicated reported IW_CM_EVENT_CONNECT_REPLY event
        iio: cros: Register FIFO callback after sensor is registered
        clk: qcom: gcc-msm8939: Fix weird field spacing in ftbl_gcc_camss_cci_clk
        RDMA/hfi1: fix potential memory leak in setup_base_ctxt()
        gpio: gpiolib-of: Fix refcount bugs in of_mm_gpiochip_add_data()
        HID: mcp2221: prevent a buffer overflow in mcp_smbus_write()
        HID: amd_sfh: Add NULL check for hid device
        dmaengine: imx-dma: Cast of_device_get_match_data() with (uintptr_t)
        scripts/gdb: lx-dmesg: read records individually
        scripts/gdb: fix 'lx-dmesg' on 32 bits arch
        RDMA/rxe: Fix mw bind to allow any consumer key portion
        mmc: cavium-octeon: Add of_node_put() when breaking out of loop
        mmc: cavium-thunderx: Add of_node_put() when breaking out of loop
        HID: alps: Declare U1_UNICORN_LEGACY support
        RDMA/rxe: For invalidate compare according to set keys in mr
        PCI: tegra194: Fix Root Port interrupt handling
        PCI: tegra194: Fix link up retry sequence
        HID: amd_sfh: Handle condition of "no sensors"
        USB: serial: fix tty-port initialized comments
        usb: cdns3: change place of 'priv_ep' assignment in cdns3_gadget_ep_dequeue(), cdns3_gadget_ep_enable()
        mtd: spi-nor: fix spi_nor_spimem_setup_op() call in spi_nor_erase_{sector,chip}()
        KVM: nVMX: Set UMIP bit CR4_FIXED1 MSR when emulating UMIP
        platform/olpc: Fix uninitialized data in debugfs write
        RDMA/srpt: Duplicate port name members
        RDMA/srpt: Introduce a reference count in struct srpt_device
        RDMA/srpt: Fix a use-after-free
        android: binder: stop saving a pointer to the VMA
        mm/mmap.c: fix missing call to vm_unacct_memory in mmap_region
        selftests: kvm: set rax before vmcall
        of/fdt: declared return type does not match actual return type
        RDMA/mlx5: Add missing check for return value in get namespace flow
        RDMA/rxe: Add memory barriers to kernel queues
        RDMA/rxe: Remove the is_user members of struct rxe_sq/rxe_rq/rxe_srq
        RDMA/rxe: Fix error unwind in rxe_create_qp()
        block/rnbd-srv: Set keep_id to true after mutex_trylock
        null_blk: fix ida error handling in null_add_dev()
        nvme: use command_id instead of req->tag in trace_nvme_complete_rq()
        nvme: define compat_ioctl again to unbreak 32-bit userspace.
        nvme: disable namespace access for unsupported metadata
        nvme: don't return an error from nvme_configure_metadata
        nvme: catch -ENODEV from nvme_revalidate_zones again
        block/bio: remove duplicate append pages code
        block: ensure iov_iter advances for added pages
        jbd2: fix outstanding credits assert in jbd2_journal_commit_transaction()
        ext4: recover csum seed of tmp_inode after migrating to extents
        jbd2: fix assertion 'jh->b_frozen_data == NULL' failure when journal aborted
        usb: cdns3: Don't use priv_dev uninitialized in cdns3_gadget_ep_enable()
        opp: Fix error check in dev_pm_opp_attach_genpd()
        ASoC: cros_ec_codec: Fix refcount leak in cros_ec_codec_platform_probe
        ASoC: samsung: Fix error handling in aries_audio_probe
        ASoC: imx-audmux: Silence a clang warning
        ASoC: mediatek: mt8173: Fix refcount leak in mt8173_rt5650_rt5676_dev_probe
        ASoC: mt6797-mt6351: Fix refcount leak in mt6797_mt6351_dev_probe
        ASoC: codecs: da7210: add check for i2c_add_driver
        ASoC: mediatek: mt8173-rt5650: Fix refcount leak in mt8173_rt5650_dev_probe
        serial: 8250: Export ICR access helpers for internal use
        serial: 8250: dma: Allow driver operations before starting DMA transfers
        serial: 8250_dw: Store LSR into lsr_saved_flags in dw8250_tx_wait_empty()
        ASoC: codecs: msm8916-wcd-digital: move gains from SX_TLV to S8_TLV
        ASoC: codecs: wcd9335: move gains from SX_TLV to S8_TLV
        rpmsg: char: Add mutex protection for rpmsg_eptdev_open()
        rpmsg: mtk_rpmsg: Fix circular locking dependency
        remoteproc: k3-r5: Fix refcount leak in k3_r5_cluster_of_init
        selftests/livepatch: better synchronize test_klp_callbacks_busy
        profiling: fix shift too large makes kernel panic
        remoteproc: imx_rproc: Fix refcount leak in imx_rproc_addr_init
        ASoC: samsung: h1940_uda1380: include proepr GPIO consumer header
        powerpc/perf: Optimize clearing the pending PMI and remove WARN_ON for PMI check in power_pmu_disable
        ASoC: samsung: change gpiod_speaker_power and rx1950_audio from global to static variables
        tty: n_gsm: Delete gsmtty open SABM frame when config requester
        tty: n_gsm: fix user open not possible at responder until initiator open
        tty: n_gsm: fix tty registration before control channel open
        tty: n_gsm: fix wrong queuing behavior in gsm_dlci_data_output()
        tty: n_gsm: fix missing timer to handle stalled links
        tty: n_gsm: fix non flow control frames during mux flow off
        tty: n_gsm: fix packet re-transmission without open control channel
        tty: n_gsm: fix race condition in gsmld_write()
        tty: n_gsm: fix resource allocation order in gsm_activate_mux()
        ASoC: qcom: Fix missing of_node_put() in asoc_qcom_lpass_cpu_platform_probe()
        ASoC: imx-card: Fix DSD/PDM mclk frequency
        remoteproc: qcom: wcnss: Fix handling of IRQs
        vfio/ccw: Do not change FSM state in subchannel event
        serial: 8250_fsl: Don't report FE, PE and OE twice
        tty: n_gsm: fix wrong T1 retry count handling
        tty: n_gsm: fix DM command
        tty: n_gsm: fix missing corner cases in gsmld_poll()
        MIPS: vdso: Utilize __pa() for gic_pfn
        swiotlb: fail map correctly with failed io_tlb_default_mem
        ASoC: mt6359: Fix refcount leak bug
        serial: 8250_bcm7271: Save/restore RTS in suspend/resume
        iommu/exynos: Handle failed IOMMU device registration properly
        9p: fix a bunch of checkpatch warnings
        9p: Drop kref usage
        9p: Add client parameter to p9_req_put()
        net: 9p: fix refcount leak in p9_read_work() error handling
        MIPS: Fixed __debug_virt_addr_valid()
        rpmsg: qcom_smd: Fix refcount leak in qcom_smd_parse_edge
        kfifo: fix kfifo_to_user() return type
        lib/smp_processor_id: fix imbalanced instrumentation_end() call
        proc: fix a dentry lock race between release_task and lookup
        remoteproc: qcom: pas: Check if coredump is enabled
        remoteproc: sysmon: Wait for SSCTL service to come up
        mfd: t7l66xb: Drop platform disable callback
        mfd: max77620: Fix refcount leak in max77620_initialise_fps
        iommu/arm-smmu: qcom_iommu: Add of_node_put() when breaking out of loop
        perf tools: Fix dso_id inode generation comparison
        s390/dump: fix old lowcore virtual vs physical address confusion
        s390/maccess: fix semantics of memcpy_real() and its callers
        s390/crash: fix incorrect number of bytes to copy to user space
        s390/zcore: fix race when reading from hardware system area
        ASoC: fsl_asrc: force cast the asrc_format type
        ASoC: fsl-asoc-card: force cast the asrc_format type
        ASoC: fsl_easrc: use snd_pcm_format_t type for sample_format
        ASoC: imx-card: use snd_pcm_format_t type for asrc_format
        ASoC: qcom: q6dsp: Fix an off-by-one in q6adm_alloc_copp()
        fuse: Remove the control interface for virtio-fs
        ASoC: audio-graph-card: Add of_node_put() in fail path
        watchdog: sp5100_tco: Fix a memory leak of EFCH MMIO resource
        watchdog: armada_37xx_wdt: check the return value of devm_ioremap() in armada_37xx_wdt_probe()
        video: fbdev: amba-clcd: Fix refcount leak bugs
        video: fbdev: sis: fix typos in SiS_GetModeID()
        ASoC: mchp-spdifrx: disable end of block interrupt on failures
        powerpc/32: Call mmu_mark_initmem_nx() regardless of data block mapping.
        powerpc/32: Do not allow selection of e5500 or e6500 CPUs on PPC32
        powerpc/iommu: Fix iommu_table_in_use for a small default DMA window case
        powerpc/pci: Prefer PCI domain assignment via DT 'linux,pci-domain' and alias
        tty: serial: fsl_lpuart: correct the count of break characters
        s390/dump: fix os_info virtual vs physical address confusion
        s390/smp: cleanup target CPU callback starting
        s390/smp: cleanup control register update routines
        s390/maccess: rework absolute lowcore accessors
        s390/smp: enforce lowcore protection on CPU restart
        f2fs: fix to remove F2FS_COMPR_FL and tag F2FS_NOCOMP_FL at the same time
        powerpc/spufs: Fix refcount leak in spufs_init_isolated_loader
        powerpc/xive: Fix refcount leak in xive_get_max_prio
        powerpc/cell/axon_msi: Fix refcount leak in setup_msi_msg_address
        perf symbol: Fail to read phdr workaround
        kprobes: Forbid probing on trampoline and BPF code areas
        x86/bus_lock: Don't assume the init value of DEBUGCTLMSR.BUS_LOCK_DETECT to be zero
        powerpc/pci: Fix PHB numbering when using opal-phbid
        genelf: Use HAVE_LIBCRYPTO_SUPPORT, not the never defined HAVE_LIBCRYPTO
        scripts/faddr2line: Fix vmlinux detection on arm64
        sched/deadline: Merge dl_task_can_attach() and dl_cpu_busy()
        sched, cpuset: Fix dl_cpu_busy() panic due to empty cs->cpus_allowed
        x86/numa: Use cpumask_available instead of hardcoded NULL check
        video: fbdev: arkfb: Fix a divide-by-zero bug in ark_set_pixclock()
        tools/thermal: Fix possible path truncations
        sched: Fix the check of nr_running at queue wakelist
        sched: Remove the limitation of WF_ON_CPU on wakelist if wakee cpu is idle
        sched/core: Do not requeue task on CPU excluded from cpus_mask
        x86/entry: Build thunk_$(BITS) only if CONFIG_PREEMPTION=y
        f2fs: allow compression for mmap files in compress_mode=user
        f2fs: do not allow to decompress files have FI_COMPRESS_RELEASED
        video: fbdev: vt8623fb: Check the size of screen before memset_io()
        video: fbdev: arkfb: Check the size of screen before memset_io()
        video: fbdev: s3fb: Check the size of screen before memset_io()
        scsi: ufs: core: Correct ufshcd_shutdown() flow
        scsi: zfcp: Fix missing auto port scan and thus missing target ports
        scsi: qla2xxx: Fix imbalance vha->vref_count
        scsi: qla2xxx: Fix discovery issues in FC-AL topology
        scsi: qla2xxx: Turn off multi-queue for 8G adapters
        scsi: qla2xxx: Fix crash due to stale SRB access around I/O timeouts
        scsi: qla2xxx: Fix excessive I/O error messages by default
        scsi: qla2xxx: Fix erroneous mailbox timeout after PCI error injection
        scsi: qla2xxx: Wind down adapter after PCIe error
        scsi: qla2xxx: Fix losing FCP-2 targets on long port disable with I/Os
        scsi: qla2xxx: Fix losing target when it reappears during delete
        scsi: qla2xxx: Fix losing FCP-2 targets during port perturbation tests
        x86/bugs: Enable STIBP for IBPB mitigated RETBleed
        ftrace/x86: Add back ftrace_expected assignment
        x86/kprobes: Update kcb status flag after singlestepping
        x86/olpc: fix 'logical not is only applied to the left hand side'
        SMB3: fix lease break timeout when multiple deferred close handles for the same file.
        posix-cpu-timers: Cleanup CPU timers before freeing them during exec
        Input: gscps2 - check return value of ioremap() in gscps2_probe()
        __follow_mount_rcu(): verify that mount_lock remains unchanged
        spmi: trace: fix stack-out-of-bound access in SPMI tracing functions
        drm/mediatek: Allow commands to be sent during video mode
        drm/mediatek: Keep dsi as LP00 before dcs cmds transfer
        crypto: blake2s - remove shash module
        drm/dp/mst: Read the extended DPCD capabilities during system resume
        drm/vc4: drv: Adopt the dma configuration from the HVS or V3D component
        usbnet: smsc95xx: Don't clear read-only PHY interrupt
        usbnet: smsc95xx: Avoid link settings race on interrupt reception
        usbnet: smsc95xx: Forward PHY interrupts to PHY driver to avoid polling
        usbnet: smsc95xx: Fix deadlock on runtime resume
        firmware: arm_scpi: Ensure scpi_info is not assigned if the probe fails
        scsi: lpfc: Fix EEH support for NVMe I/O
        scsi: lpfc: SLI path split: Refactor lpfc_iocbq
        scsi: lpfc: SLI path split: Refactor fast and slow paths to native SLI4
        scsi: lpfc: SLI path split: Refactor SCSI paths
        scsi: lpfc: Remove extra atomic_inc on cmd_pending in queuecommand after VMID
        intel_th: pci: Add Meteor Lake-P support
        intel_th: pci: Add Raptor Lake-S PCH support
        intel_th: pci: Add Raptor Lake-S CPU support
        KVM: set_msr_mce: Permit guests to ignore single-bit ECC errors
        KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS)
        iommu/vt-d: avoid invalid memory access via node_online(NUMA_NO_NODE)
        PCI/AER: Iterate over error counters instead of error strings
        PCI: qcom: Power on PHY before IPQ8074 DBI register accesses
        serial: 8250_pci: Refactor the loop in pci_ite887x_init()
        serial: 8250_pci: Replace dev_*() by pci_*() macros
        serial: 8250: Fold EndRun device support into OxSemi Tornado code
        serial: 8250: Add proper clock handling for OxSemi PCIe devices
        tty: 8250: Add support for Brainboxes PX cards.
        dm writecache: set a default MAX_WRITEBACK_JOBS
        kexec, KEYS, s390: Make use of built-in and secondary keyring for signature verification
        dm thin: fix use-after-free crash in dm_sm_register_threshold_callback
        net/9p: Initialize the iounit field during fid creation
        ARM: remove some dead code
        timekeeping: contribute wall clock to rng on time change
        locking/csd_lock: Change csdlock_debug from early_param to __setup
        block: remove the struct blk_queue_ctx forward declaration
        block: don't allow the same type rq_qos add more than once
        btrfs: ensure pages are unlocked on cow_file_range() failure
        btrfs: reset block group chunk force if we have to wait
        btrfs: properly flag filesystem with BTRFS_FEATURE_INCOMPAT_BIG_METADATA
        ACPI: CPPC: Do not prevent CPPC from working in the future
        powerpc/powernv/kvm: Use darn for H_RANDOM on Power9
        KVM: x86/pmu: Introduce the ctrl_mask value for fixed counter
        KVM: VMX: Mark all PERF_GLOBAL_(OVF)_CTRL bits reserved if there's no vPMU
        KVM: x86/pmu: Ignore pmu->global_ctrl check if vPMU doesn't support global_ctrl
        KVM: VMX: Add helper to check if the guest PMU has PERF_GLOBAL_CTRL
        KVM: nVMX: Attempt to load PERF_GLOBAL_CTRL on nVMX xfer iff it exists
        dm raid: fix address sanitizer warning in raid_status
        dm raid: fix address sanitizer warning in raid_resume
        tracing: Add '__rel_loc' using trace event macros
        tracing: Avoid -Warray-bounds warning for __rel_loc macro
        ext4: update s_overhead_clusters in the superblock during an on-line resize
        ext4: fix extent status tree race in writeback error recovery path
        ext4: add EXT4_INODE_HAS_XATTR_SPACE macro in xattr.h
        ext4: fix use-after-free in ext4_xattr_set_entry
        ext4: correct max_inline_xattr_value_size computing
        ext4: correct the misjudgment in ext4_iget_extra_inode
        ext4: fix warning in ext4_iomap_begin as race between bmap and write
        ext4: check if directory block is within i_size
        ext4: make sure ext4_append() always allocates new block
        ext4: remove EA inode entry from mbcache on inode eviction
        ext4: use kmemdup() to replace kmalloc + memcpy
        ext4: unindent codeblock in ext4_xattr_block_set()
        ext4: fix race when reusing xattr blocks
        KEYS: asymmetric: enforce SM2 signature use pkey algo
        tpm: eventlog: Fix section mismatch for DEBUG_SECTION_MISMATCH
        xen-blkback: fix persistent grants negotiation
        xen-blkback: Apply 'feature_persistent' parameter when connect
        xen-blkfront: Apply 'feature_persistent' parameter when connect
        powerpc: Fix eh field when calling lwarx on PPC32
        tracing: Use a struct alignof to determine trace event field alignment
        net_sched: cls_route: remove from list when handle is 0
        mac80211: fix a memory leak where sta_info is not freed
        tcp: fix over estimation in sk_forced_mem_schedule()
        crypto: lib/blake2s - reduce stack frame usage in self test
        Revert "mwifiex: fix sleep in atomic context bugs caused by dev_coredumpv"
        Revert "s390/smp: enforce lowcore protection on CPU restart"
        drm/bridge: tc358767: Fix (e)DP bridge endpoint parsing in dedicated function
        net: phy: smsc: Disable Energy Detect Power-Down in interrupt mode
        drm/vc4: change vc4_dma_range_matches from a global to static
        tracing/perf: Avoid -Warray-bounds warning for __rel_loc macro
        drm/msm: Fix dirtyfb refcounting
        drm/meson: Fix refcount leak in meson_encoder_hdmi_init
        io_uring: mem-account pbuf buckets
        Revert "net: usb: ax88179_178a needs FLAG_SEND_ZLP"
        Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm regression
        drm/bridge: Move devm_drm_of_get_bridge to bridge/panel.c
        scsi: lpfc: Fix locking for lpfc_sli_iocbq_lookup()
        scsi: lpfc: Fix element offset in __lpfc_sli_release_iocbq_s4()
        scsi: lpfc: Resolve some cleanup issues following SLI path refactoring
        Linux 5.15.61

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: Iec359ed301bcbcd6e19b67ee8534418fab26850b
2022-09-21 17:30:12 +02:00


// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support several policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
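/*
 * Illustrative userspace sketch (not part of this file): the modes
 * described above are normally selected from userspace with the
 * set_mempolicy(2) and mbind(2) system calls. A minimal example,
 * assuming a machine with at least two online NUMA nodes and libnuma's
 * <numaif.h> wrappers (link with -lnuma):
 *
 *      #include <numaif.h>
 *      #include <stdio.h>
 *
 *      int main(void)
 *      {
 *              // Bit mask of node ids: bits 0 and 1 => nodes 0 and 1.
 *              unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *              // Interleave this process's future allocations across
 *              // nodes 0 and 1 (sets the process policy).
 *              if (set_mempolicy(MPOL_INTERLEAVE, &mask,
 *                                8 * sizeof(mask)) != 0) {
 *                      perror("set_mempolicy");
 *                      return 1;
 *              }
 *              return 0;
 *      }
 *
 * A per-VMA policy for an existing mapping would use
 * mbind(addr, len, mode, &mask, maxnode, flags) instead.
 */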
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
#include <asm/tlbflush.h>
#include <linux/uaccess.h>
#include "internal.h"
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;
/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_LOCAL,
};
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/**
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Lookup the next closest node by distance if @node is not online.
 */
int numa_map_to_online_node(int node)
{
        int min_dist = INT_MAX, dist, n, min_node;

        if (node == NUMA_NO_NODE || node_online(node))
                return node;

        min_node = node;
        for_each_online_node(n) {
                dist = node_distance(node, n);
                if (dist < min_dist) {
                        min_dist = dist;
                        min_node = n;
                }
        }

        return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);
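/*
 * Worked example with hypothetical distances: if node 2 is offline,
 * node_distance(2, 0) == 20 and node_distance(2, 1) == 10, then
 * numa_map_to_online_node(2) returns 1, the nearest online node.
 * NUMA_NO_NODE and already-online nodes are returned unchanged.
 */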
struct mempolicy *get_task_policy(struct task_struct *p)
{
        struct mempolicy *pol = p->mempolicy;
        int node;

        if (pol)
                return pol;

        node = numa_node_id();
        if (node != NUMA_NO_NODE) {
                pol = &preferred_node_policy[node];
                /* preferred_node_policy is not initialised early in boot */
                if (pol->mode)
                        return pol;
        }

        return &default_policy;
}
static const struct mempolicy_operations {
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
        return pol->flags & MPOL_MODE_FLAGS;
}
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
const nodemask_t *rel)
{
nodemask_t tmp;
nodes_fold(tmp, *orig, nodes_weight(*rel));
nodes_onto(*ret, tmp, *rel);
}
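/*
 * Illustrative worked example of the relative-nodes remap above (values
 * chosen arbitrarily, comment only): with *orig == {0, 2} and
 * *rel == {4, 5}, nodes_fold() folds {0, 2} modulo
 * nodes_weight({4, 5}) == 2, giving tmp == {0}; nodes_onto() then maps
 * bit 0 of tmp onto the first set bit of *rel, so *ret == {4}.
 */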
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
pol->nodes = *nodes;
return 0;
}
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
nodes_clear(pol->nodes);
node_set(first_node(*nodes), pol->nodes);
return 0;
}
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
* parameter with respect to the policy mode and flags.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
* and mempolicy. May also be called holding the mmap_lock for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
int ret;
/*
* Default (pol==NULL) and local memory policies are not subject
* to any remapping. They also do not need any special
* constructor.
*/
if (!pol || pol->mode == MPOL_LOCAL)
return 0;
/* Check N_MEMORY */
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_MEMORY]);
VM_BUG_ON(!nodes);
if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
else
nodes_and(nsc->mask2, *nodes, nsc->mask1);
if (mpol_store_user_nodemask(pol))
pol->w.user_nodemask = *nodes;
else
pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
return ret;
}
/*
* This function just creates a new policy, does some checks and simple
* initialization. You must invoke mpol_set_nodemask() to set nodes.
*/
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *policy;
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
return NULL;
}
VM_BUG_ON(!nodes);
/*
* MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
* MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
* All other modes require a valid pointer to a non-empty nodemask.
*/
if (mode == MPOL_PREFERRED) {
if (nodes_empty(*nodes)) {
if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
mode = MPOL_LOCAL;
}
} else if (mode == MPOL_LOCAL) {
if (!nodes_empty(*nodes) ||
(flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL);
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
return policy;
}
/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
if (!atomic_dec_and_test(&p->refcnt))
return;
kmem_cache_free(policy_cache, p);
}
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
nodemask_t tmp;
if (pol->flags & MPOL_F_STATIC_NODES)
nodes_and(tmp, pol->w.user_nodemask, *nodes);
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
*nodes);
pol->w.cpuset_mems_allowed = *nodes;
}
if (nodes_empty(tmp))
tmp = *nodes;
pol->nodes = tmp;
}
static void mpol_rebind_preferred(struct mempolicy *pol,
const nodemask_t *nodes)
{
pol->w.cpuset_mems_allowed = *nodes;
}
/*
* mpol_rebind_policy - Migrate a policy to a different set of nodes
*
* Per-vma policies are protected by mmap_lock. Allocations using per-task
* policies are protected by task->mems_allowed_seq to prevent a premature
* OOM/allocation failure due to parallel nodemask modification.
*/
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol || pol->mode == MPOL_LOCAL)
return;
if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
mpol_ops[pol->mode].rebind(pol, newmask);
}
/*
* Wrapper for mpol_rebind_policy() that just requires task
* pointer, and updates task mempolicy.
*
* Called with task's alloc_lock held.
*/
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
mpol_rebind_policy(tsk->mempolicy, new);
}
/*
* Rebind each vma in mm to new nodemask.
*
* Call holding a reference to mm. Takes mm->mmap_lock during call.
*/
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
mmap_write_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next)
mpol_rebind_policy(vma->vm_policy, new);
mmap_write_unlock(mm);
}
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
[MPOL_DEFAULT] = {
.rebind = mpol_rebind_default,
},
[MPOL_INTERLEAVE] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_PREFERRED] = {
.create = mpol_new_preferred,
.rebind = mpol_rebind_preferred,
},
[MPOL_BIND] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_LOCAL] = {
.rebind = mpol_rebind_default,
},
[MPOL_PREFERRED_MANY] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_preferred,
},
};
static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
struct queue_pages {
struct list_head *pagelist;
unsigned long flags;
nodemask_t *nmask;
unsigned long start;
unsigned long end;
struct vm_area_struct *first;
};
/*
* Check if the page's nid is in qp->nmask.
*
* If MPOL_MF_INVERT is set in qp->flags, check if the nid is
* in the complement of qp->nmask instead.
*/
static inline bool queue_pages_required(struct page *page,
struct queue_pages *qp)
{
int nid = page_to_nid(page);
unsigned long flags = qp->flags;
return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
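/*
 * Example (illustrative): do_mbind() calls queue_pages_range() with
 * MPOL_MF_INVERT set, so for a requested nodemask of {0} a page that
 * resides on node 1 passes this check and is queued for migration,
 * while a page already on node 0 is skipped.
 */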
/*
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     a special page was encountered, i.e. the huge zero page.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - a migration entry was encountered, or only MPOL_MF_STRICT was
 *        specified and an existing page was already on a node that does
 *        not follow the policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
__releases(ptl)
{
int ret = 0;
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags;
if (unlikely(is_pmd_migration_entry(*pmd))) {
ret = -EIO;
goto unlock;
}
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
walk->action = ACTION_CONTINUE;
goto out;
}
if (!queue_pages_required(page, qp))
goto unlock;
flags = qp->flags;
/* go to thp migration */
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
if (!vma_migratable(walk->vma) ||
migrate_page_add(page, qp->pagelist, flags)) {
ret = 1;
goto unlock;
}
} else
ret = -EIO;
unlock:
spin_unlock(ptl);
out:
return ret;
}
/*
 * Scan through pages, checking if they follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     a special page was encountered, i.e. the zero page.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was
 *        already on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
int ret;
bool has_unmovable = false;
pte_t *pte, *mapped_pte;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
if (ret != 2)
return ret;
}
/* THP was split, fall through to pte walk */
if (pmd_trans_unstable(pmd))
return 0;
mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
if (!page)
continue;
/*
* vm_normal_page() filters out zero pages, but there might
* still be PageReserved pages to skip, perhaps in a VDSO.
*/
if (PageReserved(page))
continue;
if (!queue_pages_required(page, qp))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
/* MPOL_MF_STRICT must be specified if we get here */
if (!vma_migratable(vma)) {
has_unmovable = true;
break;
}
/*
* Do not abort immediately since there may be pages
* temporarily off the LRU in the range. We still need
* to migrate the other LRU pages.
*/
if (migrate_page_add(page, qp->pagelist, flags))
has_unmovable = true;
} else
break;
}
pte_unmap_unlock(mapped_pte, ptl);
cond_resched();
if (has_unmovable)
return 1;
return addr != end ? -EIO : 0;
}
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
unsigned long flags = (qp->flags & MPOL_MF_VALID);
struct page *page;
spinlock_t *ptl;
pte_t entry;
ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
entry = huge_ptep_get(pte);
if (!pte_present(entry))
goto unlock;
page = pte_page(entry);
if (!queue_pages_required(page, qp))
goto unlock;
if (flags == MPOL_MF_STRICT) {
/*
* STRICT alone means only detecting a misplaced page; there is
* no need to check further in other vmas.
*/
ret = -EIO;
goto unlock;
}
if (!vma_migratable(walk->vma)) {
/*
* Must be STRICT with MOVE*; otherwise .test_walk() would
* have stopped walking the current vma.
* A misplaced page was detected, but we still allow
* migrating pages which have been queued.
*/
ret = 1;
goto unlock;
}
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
if (!isolate_huge_page(page, qp->pagelist) &&
(flags & MPOL_MF_STRICT))
/*
* Failed to isolate the page, but still allow migrating pages
* which have been queued.
*/
ret = 1;
}
unlock:
spin_unlock(ptl);
#else
BUG();
#endif
return ret;
}
#ifdef CONFIG_NUMA_BALANCING
/*
* This is used to mark a range of virtual addresses as inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
* faults, pages may be migrated for better NUMA placement.
*
* This is assuming that NUMA faults are handled using PROT_NONE. If
* an architecture makes a different choice, it will need further
* changes to the core.
*/
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
int nr_updated;
nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct queue_pages *qp = walk->private;
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
/* range check first */
VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
if (!qp->first) {
qp->first = vma;
if (!(flags & MPOL_MF_DISCONTIG_OK) &&
(qp->start < vma->vm_start))
/* hole at head side of range */
return -EFAULT;
}
if (!(flags & MPOL_MF_DISCONTIG_OK) &&
((vma->vm_end < qp->end) &&
(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
/* hole at middle or tail of range */
return -EFAULT;
/*
* Need to check MPOL_MF_STRICT so that -EIO can be returned if
* needed, regardless of vma_migratable
*/
if (!vma_migratable(vma) &&
!(flags & MPOL_MF_STRICT))
return 1;
if (endvma > end)
endvma = end;
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
!(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
}
/* queue pages from current vma */
if (flags & MPOL_MF_VALID)
return 0;
return 1;
}
static const struct mm_walk_ops queue_pages_walk_ops = {
.hugetlb_entry = queue_pages_hugetlb,
.pmd_entry = queue_pages_pte_range,
.test_walk = queue_pages_test_walk,
};
/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued to the pagelist which
 * is passed via @private.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - pages were queued successfully, or no misplaced page was found.
 * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or
 *         the memory range specified by nodemask and maxnode points
 *         outside the accessible address space (-EFAULT).
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
nodemask_t *nodes, unsigned long flags,
struct list_head *pagelist)
{
int err;
struct queue_pages qp = {
.pagelist = pagelist,
.flags = flags,
.nmask = nodes,
.start = start,
.end = end,
.first = NULL,
};
err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
if (!qp.first)
/* whole range in hole */
err = -EFAULT;
return err;
}
/*
* Apply policy to a single VMA
* This must be called with the mmap_lock held for writing.
*/
static int vma_replace_policy(struct vm_area_struct *vma,
struct mempolicy *pol)
{
int err;
struct mempolicy *old;
struct mempolicy *new;
pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
vma->vm_ops ? vma->vm_ops->set_policy : NULL);
new = mpol_dup(pol);
if (IS_ERR(new))
return PTR_ERR(new);
if (vma->vm_ops && vma->vm_ops->set_policy) {
err = vma->vm_ops->set_policy(vma, new);
if (err)
goto err_out;
}
old = vma->vm_policy;
vma->vm_policy = new; /* protected by mmap_lock */
mpol_put(old);
return 0;
err_out:
mpol_put(new);
return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
pgoff_t pgoff;
unsigned long vmstart;
unsigned long vmend;
vma = find_vma(mm, start);
VM_BUG_ON(!vma);
prev = vma->vm_prev;
if (start > vma->vm_start)
prev = vma;
for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
if (mpol_equal(vma_policy(vma), new_pol))
continue;
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx,
anon_vma_name(vma));
if (prev) {
vma = prev;
goto replace;
}
if (vma->vm_start != vmstart) {
err = split_vma(vma->vm_mm, vma, vmstart, 1);
if (err)
goto out;
}
if (vma->vm_end != vmend) {
err = split_vma(vma->vm_mm, vma, vmend, 0);
if (err)
goto out;
}
replace:
err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
}
out:
return err;
}
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *new, *old;
NODEMASK_SCRATCH(scratch);
int ret;
if (!scratch)
return -ENOMEM;
new = mpol_new(mode, flags, nodes);
if (IS_ERR(new)) {
ret = PTR_ERR(new);
goto out;
}
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
mpol_put(new);
goto out;
}
task_lock(current);
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
current->il_prev = MAX_NUMNODES-1;
task_unlock(current);
mpol_put(old);
ret = 0;
out:
NODEMASK_SCRATCH_FREE(scratch);
return ret;
}
/*
* Return nodemask for policy for get_mempolicy() query
*
* Called with task's alloc_lock held
*/
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
nodes_clear(*nodes);
if (p == &default_policy)
return;
switch (p->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
*nodes = p->nodes;
break;
case MPOL_LOCAL:
/* return empty node mask for local allocation */
break;
default:
BUG();
}
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
struct page *p = NULL;
int err;
int locked = 1;
err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
if (err > 0) {
err = page_to_nid(p);
put_page(p);
}
if (locked)
mmap_read_unlock(mm);
return err;
}
/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
unsigned long addr, unsigned long flags)
{
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
if (flags & MPOL_F_MEMS_ALLOWED) {
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
task_lock(current);
*nmask = cpuset_current_mems_allowed;
task_unlock(current);
return 0;
}
if (flags & MPOL_F_ADDR) {
/*
* Do NOT fall back to task policy if the
* vma/shared policy at addr is NULL. We
* want to return MPOL_DEFAULT in this case.
*/
mmap_read_lock(mm);
vma = vma_lookup(mm, addr);
if (!vma) {
mmap_read_unlock(mm);
return -EFAULT;
}
if (vma->vm_ops && vma->vm_ops->get_policy)
pol = vma->vm_ops->get_policy(vma, addr);
else
pol = vma->vm_policy;
} else if (addr)
return -EINVAL;
if (!pol)
pol = &default_policy; /* indicates default behavior */
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
/*
* Take a refcount on the mpol, lookup_node()
* will drop the mmap_lock, so after calling
* lookup_node() only "pol" remains valid, "vma"
* is stale.
*/
pol_refcount = pol;
vma = NULL;
mpol_get(pol);
err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
*policy = next_node_in(current->il_prev, pol->nodes);
} else {
err = -EINVAL;
goto out;
}
} else {
*policy = pol == &default_policy ? MPOL_DEFAULT :
pol->mode;
/*
* Internal mempolicy flags must be masked off before exposing
* the policy to userspace.
*/
*policy |= (pol->flags & MPOL_MODE_FLAGS);
}
err = 0;
if (nmask) {
if (mpol_store_user_nodemask(pol)) {
*nmask = pol->w.user_nodemask;
} else {
task_lock(current);
get_policy_nodemask(pol, nmask);
task_unlock(current);
}
}
out:
mpol_cond_put(pol);
if (vma)
mmap_read_unlock(mm);
if (pol_refcount)
mpol_put(pol_refcount);
return err;
}
#ifdef CONFIG_MIGRATION
/*
* page migration; THP tail pages can be passed.
*/
static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
struct page *head = compound_head(page);
/*
* Avoid migrating a page that is shared with others.
*/
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
if (!isolate_lru_page(head)) {
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON + page_is_file_lru(head),
thp_nr_pages(head));
} else if (flags & MPOL_MF_STRICT) {
/*
* A non-movable page may reach here. There may also be
* pages temporarily off the LRU, or non-LRU movable pages.
* Treat them as unmovable pages, since they can't be
* isolated and thus can't be moved at the moment. We
* should return -EIO for this case too.
*/
return -EIO;
}
}
return 0;
}
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
*/
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int flags)
{
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
struct migration_target_control mtc = {
.nid = dest,
.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
};
nodes_clear(nmask);
node_set(source, nmask);
/*
* This does not "check" the range but isolates all pages that
* need migration. Between passing in the full user address
* space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
*/
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, alloc_migration_target, NULL,
(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
putback_movable_pages(&pagelist);
}
return err;
}
/*
* Move pages between the two nodesets so as to preserve the physical
* layout as much as possible.
*
* Returns the number of pages that could not be moved.
*/
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
int busy = 0;
int err = 0;
nodemask_t tmp;
lru_cache_disable();
mmap_read_lock(mm);
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
* bit in 'tmp', and return that <source, dest> pair for migration.
* The pair of nodemasks 'to' and 'from' define the map.
*
* If no pair of bits is found that way, fall back to picking some
* pair of 'source' and 'dest' bits that are not the same. If the
* 'source' and 'dest' bits are the same, this represents a node
* that will be migrating to itself, so no pages need to move.
*
* If no bits are left in 'tmp', or if all remaining bits left
* in 'tmp' correspond to the same bit in 'to', return false
* (nothing left to migrate).
*
* This lets us pick a pair of nodes to migrate between, such that
* if possible the dest node is not already occupied by some other
* source node, minimizing the risk of overloading the memory on a
* node that would happen if we migrated incoming memory to a node
* before migrating outgoing memory sourced from that same node.
*
* A single scan of tmp is sufficient. As we go, we remember the
* most recent <s, d> pair that moved (s != d). If we find a pair
* that not only moved, but, better still, moved to an empty slot
* (d is not set in tmp), then we break out with that pair.
* Otherwise, when we finish scanning tmp, we at least have the
* most recent <s, d> pair that moved. If we get all the way through
* the scan of tmp without finding any node that moved, much less
* moved to an empty node, then there is nothing left worth migrating.
*/
tmp = *from;
while (!nodes_empty(tmp)) {
int s, d;
int source = NUMA_NO_NODE;
int dest = 0;
for_each_node_mask(s, tmp) {
/*
* do_migrate_pages() tries to maintain the relative
* node relationship of the pages established between
* threads and memory areas.
*
* However, if the number of source nodes is not equal to
* the number of destination nodes, we cannot preserve
* this relative node relationship. In that case, skip
* copying memory from a node that is in the destination
* mask.
*
* Example: [2,3,4] -> [3,4,5] moves everything.
*          [0-7]   -> [3,4,5] moves only 0,1,2,6,7.
*/
if ((nodes_weight(*from) != nodes_weight(*to)) &&
(node_isset(s, *to)))
continue;
d = node_remap(s, *from, *to);
if (s == d)
continue;
source = s; /* Node moved. Memorize */
dest = d;
/* dest not in remaining from nodes? */
if (!node_isset(dest, tmp))
break;
}
if (source == NUMA_NO_NODE)
break;
node_clear(source, tmp);
err = migrate_to_node(mm, source, dest, flags);
if (err > 0)
busy += err;
if (err < 0)
break;
}
mmap_read_unlock(mm);
lru_cache_enable();
if (err < 0)
return err;
return busy;
}
/*
* Allocate a new page for page migration based on vma policy.
* Start by assuming the page is mapped by the same vma that contains
* @start; if not, search forward from there. N.B. this assumes that the
* list of pages handed to migrate_pages() (which is how we get here)
* is in virtual address order.
*/
static struct page *new_page(struct page *page, unsigned long start)
{
struct vm_area_struct *vma;
unsigned long address;
vma = find_vma(current->mm, start);
while (vma) {
address = page_address_in_vma(page, vma);
if (address != -EFAULT)
break;
vma = vma->vm_next;
}
if (PageHuge(page)) {
return alloc_huge_page_vma(page_hstate(compound_head(page)),
vma, address);
} else if (PageTransHuge(page)) {
struct page *thp;
thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
HPAGE_PMD_ORDER);
if (!thp)
return NULL;
prep_transhuge_page(thp);
return thp;
}
/*
* if !vma, alloc_page_vma() will use task or system default policy
*/
return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
vma, address);
}
#else
static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
return -EIO;
}
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
return -ENOSYS;
}
static struct page *new_page(struct page *page, unsigned long start)
{
return NULL;
}
#endif
static long do_mbind(unsigned long start, unsigned long len,
unsigned short mode, unsigned short mode_flags,
nodemask_t *nmask, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
int err;
int ret;
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
if (start & ~PAGE_MASK)
return -EINVAL;
if (mode == MPOL_DEFAULT)
flags &= ~MPOL_MF_STRICT;
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
end = start + len;
if (end < start)
return -EINVAL;
if (end == start)
return 0;
new = mpol_new(mode, mode_flags, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
if (flags & MPOL_MF_LAZY)
new->flags |= MPOL_F_MOF;
/*
* If we are using the default policy then operating
* on discontinuous address spaces is okay after all.
*/
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
start, start + len, mode, mode_flags,
nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
lru_cache_disable();
}
{
NODEMASK_SCRATCH(scratch);
if (scratch) {
mmap_write_lock(mm);
err = mpol_set_nodemask(new, nmask, scratch);
if (err)
mmap_write_unlock(mm);
} else
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
}
if (err)
goto mpol_out;
ret = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
if (ret < 0) {
err = ret;
goto up_out;
}
err = mbind_range(mm, start, end, new);
if (!err) {
int nr_failed = 0;
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_page, NULL,
start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
if (nr_failed)
putback_movable_pages(&pagelist);
}
if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
err = -EIO;
} else {
up_out:
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
}
mmap_write_unlock(mm);
mpol_out:
mpol_put(new);
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
lru_cache_enable();
return err;
}
/*
* User space interface with variable sized bitmaps for nodelists.
*/
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long nlongs = BITS_TO_LONGS(maxnode);
int ret;
if (in_compat_syscall())
ret = compat_get_bitmap(mask,
(const compat_ulong_t __user *)nmask,
maxnode);
else
ret = copy_from_user(mask, nmask,
nlongs * sizeof(unsigned long));
if (ret)
return -EFAULT;
if (maxnode % BITS_PER_LONG)
mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
return 0;
}
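/*
 * Worked example (illustrative): for maxnode == 70 on a 64-bit kernel,
 * nlongs == BITS_TO_LONGS(70) == 2, so two longs are copied and the
 * last word is masked with (1UL << (70 % 64)) - 1, discarding any
 * stray bits 70..127 that userspace may have left set.
 */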
/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
--maxnode;
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
return -EINVAL;
/*
* When the user specifies more nodes than supported, just check
* that the unsupported part is all zero, one word at a time,
* starting at the end.
*/
while (maxnode > MAX_NUMNODES) {
unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
unsigned long t;
if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
return -EFAULT;
if (maxnode - bits >= MAX_NUMNODES) {
maxnode -= bits;
} else {
maxnode = MAX_NUMNODES;
t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
}
if (t)
return -EINVAL;
}
return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
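/*
 * Example (illustrative, assuming MAX_NUMNODES == 64): a caller passing
 * maxnode == 129 has it decremented to 128; the loop in get_nodes()
 * then reads the user word covering bits 64..127 and fails with
 * -EINVAL if any of them is set, before get_bitmap() copies the
 * remaining low word into *nodes.
 */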
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
bool compat = in_compat_syscall();
if (compat)
nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
return -EINVAL;
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
return -EFAULT;
copy = nbytes;
maxnode = nr_node_ids;
}
if (compat)
return compat_put_bitmap((compat_ulong_t __user *)mask,
nodes_addr(*nodes), maxnode);
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
*flags = *mode & MPOL_MODE_FLAGS;
*mode &= ~MPOL_MODE_FLAGS;
if ((unsigned int)(*mode) >= MPOL_MAX)
return -EINVAL;
if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
if (*flags & MPOL_F_NUMA_BALANCING) {
if (*mode != MPOL_BIND)
return -EINVAL;
*flags |= (MPOL_F_MOF | MPOL_F_MORON);
}
return 0;
}
static long kernel_mbind(unsigned long start, unsigned long len,
unsigned long mode, const unsigned long __user *nmask,
unsigned long maxnode, unsigned int flags)
{
unsigned short mode_flags;
nodemask_t nodes;
int lmode = mode;
int err;
start = untagged_addr(start);
err = sanitize_mpol_flags(&lmode, &mode_flags);
if (err)
return err;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
unsigned long, mode, const unsigned long __user *, nmask,
unsigned long, maxnode, unsigned int, flags)
{
return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
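/*
 * Userspace sketch (illustrative only): binding a mapping to node 0
 * might look like
 *
 *	unsigned long nodes = 0x1;	(bit 0 => node 0)
 *	mbind(addr, len, MPOL_BIND, &nodes, 2, MPOL_MF_MOVE | MPOL_MF_STRICT);
 *
 * Note that get_nodes() above copies maxnode - 1 bits, so maxnode must
 * be at least the highest node id of interest plus two.
 */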
/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned short mode_flags;
nodemask_t nodes;
int lmode = mode;
int err;
err = sanitize_mpol_flags(&lmode, &mode_flags);
if (err)
return err;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
return do_set_mempolicy(lmode, mode_flags, &nodes);
}
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
unsigned long, maxnode)
{
return kernel_set_mempolicy(mode, nmask, maxnode);
}
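/*
 * Userspace sketch (illustrative only): interleaving the calling task's
 * new allocations across nodes 0 and 1:
 *
 *	unsigned long nodes = 0x3;
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 3);
 */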
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
const unsigned long __user *old_nodes,
const unsigned long __user *new_nodes)
{
struct mm_struct *mm = NULL;
struct task_struct *task;
nodemask_t task_nodes;
int err;
nodemask_t *old;
nodemask_t *new;
NODEMASK_SCRATCH(scratch);
if (!scratch)
return -ENOMEM;
old = &scratch->mask1;
new = &scratch->mask2;
err = get_nodes(old, old_nodes, maxnode);
if (err)
goto out;
err = get_nodes(new, new_nodes, maxnode);
if (err)
goto out;
/* Find the mm_struct */
rcu_read_lock();
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
rcu_read_unlock();
err = -ESRCH;
goto out;
}
get_task_struct(task);
err = -EINVAL;
/*
* Check if this process has the right to modify the specified process.
* Use the regular "ptrace_may_access()" checks.
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
err = -EPERM;
goto out_put;
}
rcu_read_unlock();
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out_put;
}
task_nodes = cpuset_mems_allowed(current);
nodes_and(*new, *new, task_nodes);
if (nodes_empty(*new))
goto out_put;
err = security_task_movememory(task);
if (err)
goto out_put;
mm = get_task_mm(task);
put_task_struct(task);
if (!mm) {
err = -EINVAL;
goto out;
}
err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
mmput(mm);
out:
NODEMASK_SCRATCH_FREE(scratch);
return err;
out_put:
put_task_struct(task);
goto out;
}
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
const unsigned long __user *, old_nodes,
const unsigned long __user *, new_nodes)
{
return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
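/*
 * Userspace sketch (illustrative only): moving the pages of process
 * @pid from node 0 to node 1:
 *
 *	unsigned long old = 0x1, new = 0x2;
 *	migrate_pages(pid, 3, &old, &new);
 */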
/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
unsigned long __user *nmask,
unsigned long maxnode,
unsigned long addr,
unsigned long flags)
{
int err;
int pval;
nodemask_t nodes;
if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
addr = untagged_addr(addr);
err = do_get_mempolicy(&pval, &nodes, addr, flags);
if (err)
return err;
if (policy && put_user(pval, policy))
return -EFAULT;
if (nmask)
err = copy_nodes_to_user(nmask, maxnode, &nodes);
return err;
}
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
unsigned long __user *, nmask, unsigned long, maxnode,
unsigned long, addr, unsigned long, flags)
{
return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
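/*
 * Userspace sketch (illustrative only): querying which node backs the
 * page at @addr:
 *
 *	int node;
 *	get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *
 * On success, "node" holds the node id reported by lookup_node() above.
 */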
bool vma_migratable(struct vm_area_struct *vma)
{
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
return false;
/*
* DAX device mappings require predictable access latency, so avoid
* incurring periodic faults.
*/
if (vma_is_dax(vma))
return false;
if (is_vm_hugetlb_page(vma) &&
!hugepage_migration_supported(hstate_vma(vma)))
return false;
/*
* Migration allocates pages in the highest zone. If we cannot
* do so then migration (at least from node to node) is not
* possible.
*/
if (vma->vm_file &&
gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
< policy_zone)
return false;
return true;
}
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
struct mempolicy *pol = NULL;
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
pol = vma->vm_ops->get_policy(vma, addr);
} else if (vma->vm_policy) {
pol = vma->vm_policy;
/*
* shmem_alloc_page() passes an MPOL_F_SHARED policy with
* a pseudo vma whose vma->vm_ops is NULL. Take a reference
* count on these policies, which will be dropped by
* mpol_cond_put() later.
*/
if (mpol_needs_cond_ref(pol))
mpol_get(pol);
}
}
return pol;
}
/*
* get_vma_policy(@vma, @addr)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup
*
* Returns effective policy for a VMA at specified address.
* Falls back to current->mempolicy or system default policy, as necessary.
* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
* count--added by the get_policy() vm_op, as appropriate--to protect against
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
struct mempolicy *pol = __get_vma_policy(vma, addr);
if (!pol)
pol = get_task_policy(current);
return pol;
}
bool vma_policy_mof(struct vm_area_struct *vma)
{
struct mempolicy *pol;
if (vma->vm_ops && vma->vm_ops->get_policy) {
bool ret = false;
pol = vma->vm_ops->get_policy(vma, vma->vm_start);
if (pol && (pol->flags & MPOL_F_MOF))
ret = true;
mpol_cond_put(pol);
return ret;
}
pol = vma->vm_policy;
if (!pol)
pol = get_task_policy(current);
return pol->flags & MPOL_F_MOF;
}
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
/*
* If policy->nodes has movable memory only, we apply the policy
* only when gfp_zone(gfp) == ZONE_MOVABLE.
*
* policy->nodes is intersected with node_states[N_MEMORY], so
* if the following test fails, it implies that policy->nodes
* has movable memory only.
*/
if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
dynamic_policy_zone = ZONE_MOVABLE;
return zone >= dynamic_policy_zone;
}
/*
* Return a nodemask representing a mempolicy for filtering nodes for
* page allocation
*/
nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
int mode = policy->mode;
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(mode == MPOL_BIND) &&
apply_policy_zone(policy, gfp_zone(gfp)) &&
cpuset_nodemask_valid_mems_allowed(&policy->nodes))
return &policy->nodes;
if (mode == MPOL_PREFERRED_MANY)
return &policy->nodes;
return NULL;
}
/*
* Return the preferred node id for 'prefer' mempolicy, and return
* the given id for all other policies.
*
* policy_node() is always coupled with policy_nodemask(), which
* secures the nodemask limit for 'bind' and 'prefer-many' policy.
*/
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
if (policy->mode == MPOL_PREFERRED) {
nd = first_node(policy->nodes);
} else {
/*
* __GFP_THISNODE shouldn't even be used with the bind policy
* because we might easily break the expectation to stay on the
* requested node while not breaking the policy.
*/
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
}
return nd;
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
unsigned next;
struct task_struct *me = current;
next = next_node_in(me->il_prev, policy->nodes);
if (next < MAX_NUMNODES)
me->il_prev = next;
return next;
}
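/*
 * Worked example (illustrative): with policy->nodes == {0, 2, 3} and
 * il_prev == 2, next_node_in() returns 3; repeated calls then yield
 * 0, 2, 3, 0, ... in round-robin order.
 */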
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
*/
unsigned int mempolicy_slab_node(void)
{
struct mempolicy *policy;
int node = numa_mem_id();
if (!in_task())
return node;
policy = current->mempolicy;
if (!policy)
return node;
switch (policy->mode) {
case MPOL_PREFERRED:
return first_node(policy->nodes);
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
case MPOL_BIND:
case MPOL_PREFERRED_MANY:
{
struct zoneref *z;
/*
* Follow bind policy behavior and start allocation at the
* first node.
*/
struct zonelist *zonelist;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
&policy->nodes);
return z->zone ? zone_to_nid(z->zone) : node;
}
case MPOL_LOCAL:
return node;
default:
BUG();
}
}
/*
* Do static interleaving for a VMA with known offset @n. Returns the n'th
* node in pol->nodes (starting from n=0), wrapping around if n exceeds the
* number of present nodes.
*/
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
nodemask_t nodemask = pol->nodes;
unsigned int target, nnodes;
int i;
int nid;
/*
* The barrier will stabilize the nodemask in a register or on
* the stack so that it will stop changing under the code.
*
* Between first_node() and next_node(), pol->nodes could be changed
* by other threads, so we copy pol->nodes into a local variable.
*/
barrier();
nnodes = nodes_weight(nodemask);
if (!nnodes)
return numa_node_id();
target = (unsigned int)n % nnodes;
nid = first_node(nodemask);
for (i = 0; i < target; i++)
nid = next_node(nid, nodemask);
return nid;
}
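/*
 * Worked example (illustrative): with pol->nodes == {1, 3, 5} (three
 * nodes) and n == 7, target == 7 % 3 == 1, so the walk starts at node 1
 * and advances once, returning node 3.
 */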
/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long addr, int shift)
{
if (vma) {
unsigned long off;
/*
* For small pages, there is no difference between
* shift and PAGE_SHIFT, so the bit-shift is safe.
* For huge pages, since vm_pgoff is in units of small
* pages, we need to shift off the always-zero bits to get
* a useful offset.
*/
BUG_ON(shift < PAGE_SHIFT);
off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
off += (addr - vma->vm_start) >> shift;
return offset_il_node(pol, off);
} else
return interleave_nodes(pol);
}
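/*
 * Worked example (illustrative, assuming PAGE_SHIFT == 12): for a
 * hugetlb vma with shift == 21 (2MB pages), the offset becomes
 * (vma->vm_pgoff >> 9) + ((addr - vma->vm_start) >> 21), i.e. the
 * huge-page index of @addr within the mapping's interleave sequence.
 */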
#ifdef CONFIG_HUGETLBFS
/*
* huge_node(@vma, @addr, @gfp_flags, @mpol)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup and interleave policy
* @gfp_flags: for requested zone
* @mpol: pointer to mempolicy pointer for reference counted mempolicy
* @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
*
* Returns a nid suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'bind' or 'prefer-many', returns a pointer
* to the mempolicy's @nodemask for filtering the zonelist.
*
* Must be protected by read_mems_allowed_begin()
*/
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
{
int nid;
int mode;
*mpol = get_vma_policy(vma, addr);
*nodemask = NULL;
mode = (*mpol)->mode;
if (unlikely(mode == MPOL_INTERLEAVE)) {
nid = interleave_nid(*mpol, vma, addr,
huge_page_shift(hstate_vma(vma)));
} else {
nid = policy_node(gfp_flags, *mpol, numa_node_id());
if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
*nodemask = &(*mpol)->nodes;
}
return nid;
}
/*
* init_nodemask_of_mempolicy
*
* If the current task's mempolicy is "default" [NULL], return 'false'
* to indicate default policy. Otherwise, extract the policy nodemask
* for 'bind' or 'interleave' policy into the argument nodemask, or
* initialize the argument nodemask to contain the single node for
* 'preferred' or 'local' policy and return 'true' to indicate presence
* of non-default mempolicy.
*
* We don't bother with reference counting the mempolicy [mpol_get/put]
* because the current task is examining its own mempolicy and a task's
* mempolicy is only ever changed by the task itself.
*
* N.B., it is the caller's responsibility to free a returned nodemask.
*/
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
struct mempolicy *mempolicy;
if (!(mask && current->mempolicy))
return false;
task_lock(current);
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
*mask = mempolicy->nodes;
break;
case MPOL_LOCAL:
init_nodemask_of_node(mask, numa_node_id());
break;
default:
BUG();
}
task_unlock(current);
return true;
}
#endif
/*
* mempolicy_in_oom_domain
*
* If tsk's mempolicy is "bind", check for intersection between mask and
* the policy nodemask. Otherwise, return true for all other policies
* including "interleave", as a tsk with "interleave" policy may have
* memory allocated from all nodes in the system.
*
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
*/
bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask)
{
struct mempolicy *mempolicy;
bool ret = true;
if (!mask)
return ret;
task_lock(tsk);
mempolicy = tsk->mempolicy;
if (mempolicy && mempolicy->mode == MPOL_BIND)
ret = nodes_intersects(mempolicy->nodes, *mask);
task_unlock(tsk);
return ret;
}
/*
 * Allocate a page in interleaved policy.
 * Own path because it needs to do special accounting.
 */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
unsigned nid)
{
struct page *page;
page = __alloc_pages(gfp, order, nid, NULL);
/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
if (!static_branch_likely(&vm_numa_stat_key))
return page;
if (page && page_to_nid(page) == nid) {
preempt_disable();
__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
preempt_enable();
}
return page;
}
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
int nid, struct mempolicy *pol)
{
struct page *page;
gfp_t preferred_gfp;
/*
* This is a two-pass approach. The first pass will only try the
* preferred nodes, skipping direct reclaim and allowing the
* allocation to fail, while the second pass will try all the
* nodes in the system.
*/
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
if (!page)
page = __alloc_pages(gfp, order, numa_node_id(), NULL);
return page;
}
/**
* alloc_pages_vma - Allocate a page for a VMA.
* @gfp: GFP flags.
* @order: Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual address of the allocation. Must be inside @vma.
* @node: Which node to prefer for allocation (modulo policy).
* @hugepage: For hugepages try only the preferred node if possible.
*
* Allocate a page for a specific address in @vma, using the appropriate
* NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
* of the mm_struct of the VMA to prevent it from going away. Should be
* used for all allocations for pages that will be mapped into user space.
*
* Return: The page on success or NULL if allocation fails.
*/
struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node, bool hugepage)
{
struct mempolicy *pol;
struct page *page;
int preferred_nid;
nodemask_t *nmask;
pol = get_vma_policy(vma, addr);
if (pol->mode == MPOL_INTERLEAVE) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
goto out;
}
if (pol->mode == MPOL_PREFERRED_MANY) {
page = alloc_pages_preferred_many(gfp, order, node, pol);
mpol_cond_put(pol);
goto out;
}
if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
int hpage_node = node;
/*
* For hugepage allocation with a non-interleave policy that
* allows the current node (or another explicitly preferred
* node), we only try to allocate from the current/preferred
* node and don't fall back to other nodes, as the cost of
* remote accesses would likely offset THP benefits.
*
* If the policy is interleave or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
if (pol->mode == MPOL_PREFERRED)
hpage_node = first_node(pol->nodes);
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
/*
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
*/
page = __alloc_pages_node(hpage_node,
gfp | __GFP_THISNODE | __GFP_NORETRY, order);
/*
* If hugepage allocations are configured to always
* compact synchronously, or the vma has been madvised
* to prefer hugepage backing, retry allowing remote
* memory with both reclaim and compaction as well.
*/
if (!page && (gfp & __GFP_DIRECT_RECLAIM))
page = __alloc_pages(gfp, order, hpage_node, nmask);
goto out;
}
}
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages(gfp, order, preferred_nid, nmask);
mpol_cond_put(pol);
out:
return page;
}
EXPORT_SYMBOL(alloc_pages_vma);
/**
* alloc_pages - Allocate pages.
* @gfp: GFP flags.
* @order: Power of two of number of pages to allocate.
*
* Allocate 1 << @order contiguous pages. The physical address of the
* first page is naturally aligned (e.g. an order-3 allocation will be aligned
* to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
* process is honoured when in process context.
*
* Context: Can be called from any context, provided the appropriate GFP
* flags are used.
* Return: The page on success or NULL if allocation fails.
*/
struct page *alloc_pages(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = &default_policy;
struct page *page;
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else if (pol->mode == MPOL_PREFERRED_MANY)
page = alloc_pages_preferred_many(gfp, order,
numa_node_id(), pol);
else
page = __alloc_pages(gfp, order,
policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
return page;
}
EXPORT_SYMBOL(alloc_pages);
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
struct mempolicy *pol = mpol_dup(vma_policy(src));
if (IS_ERR(pol))
return PTR_ERR(pol);
dst->vm_policy = pol;
return 0;
}
/*
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
* rebinds the mempolicy it is copying by calling mpol_rebind_policy()
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
*
* current's mempolicy may be rebound by another task (the task that
* changes the cpuset's mems), so we needn't do rebind work for the
* current task.
*/
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
/* task's mempolicy is protected by alloc_lock */
if (old == current->mempolicy) {
task_lock(current);
*new = *old;
task_unlock(current);
} else
*new = *old;
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
mpol_rebind_policy(new, &mems);
}
atomic_set(&new->refcnt, 1);
return new;
}
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
if (!a || !b)
return false;
if (a->mode != b->mode)
return false;
if (a->flags != b->flags)
return false;
if (mpol_store_user_nodemask(a))
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
return false;
switch (a->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
return !!nodes_equal(a->nodes, b->nodes);
case MPOL_LOCAL:
return true;
default:
BUG();
return false;
}
}
/*
* Shared memory backing store policy support.
*
* Remember policies even when nobody has shared memory mapped.
* The policies are kept in Red-Black tree linked from the inode.
* They are protected by the sp->lock rwlock, which should be held
* for any accesses to the tree.
*/
/*
* Lookup the first element intersecting start-end. The caller holds
* sp->lock for reading or for writing.
*/
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
struct rb_node *n = sp->root.rb_node;
while (n) {
struct sp_node *p = rb_entry(n, struct sp_node, nd);
if (start >= p->end)
n = n->rb_right;
else if (end <= p->start)
n = n->rb_left;
else
break;
}
if (!n)
return NULL;
for (;;) {
struct sp_node *w = NULL;
struct rb_node *prev = rb_prev(n);
if (!prev)
break;
w = rb_entry(prev, struct sp_node, nd);
if (w->end <= start)
break;
n = prev;
}
return rb_entry(n, struct sp_node, nd);
}
/*
* Insert a new shared policy into the tree. The caller holds sp->lock for
* writing.
*/
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
struct rb_node **p = &sp->root.rb_node;
struct rb_node *parent = NULL;
struct sp_node *nd;
while (*p) {
parent = *p;
nd = rb_entry(parent, struct sp_node, nd);
if (new->start < nd->start)
p = &(*p)->rb_left;
else if (new->end > nd->end)
p = &(*p)->rb_right;
else
BUG();
}
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
new->policy ? new->policy->mode : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
struct mempolicy *pol = NULL;
struct sp_node *sn;
if (!sp->root.rb_node)
return NULL;
read_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
read_unlock(&sp->lock);
return pol;
}
static void sp_free(struct sp_node *n)
{
mpol_put(n->policy);
kmem_cache_free(sn_cache, n);
}
/**
* mpol_misplaced - check whether current page node is valid in policy
*
* @page: page to be checked
* @vma: vm area in which the page is mapped
* @addr: virtual address at which the page is mapped
*
* Look up the current policy node id for vma,addr and compare it to the
* page's node id. Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*
* Return: NUMA_NO_NODE if the page is in a node that is valid for this
* policy, or a suitable node ID to allocate a replacement page from.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol;
struct zoneref *z;
int curnid = page_to_nid(page);
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
int polnid = NUMA_NO_NODE;
int ret = NUMA_NO_NODE;
pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
switch (pol->mode) {
case MPOL_INTERLEAVE:
pgoff = vma->vm_pgoff;
pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
polnid = offset_il_node(pol, pgoff);
break;
case MPOL_PREFERRED:
if (node_isset(curnid, pol->nodes))
goto out;
polnid = first_node(pol->nodes);
break;
case MPOL_LOCAL:
polnid = numa_node_id();
break;
case MPOL_BIND:
/* Optimize placement among multiple nodes via NUMA balancing */
if (pol->flags & MPOL_F_MORON) {
if (node_isset(thisnid, pol->nodes))
break;
goto out;
}
fallthrough;
case MPOL_PREFERRED_MANY:
/*
* Use the current page if it is in the policy nodemask;
* else select the nearest allowed node, if any.
* If there are no allowed nodes, use the current node [!misplaced].
*/
if (node_isset(curnid, pol->nodes))
goto out;
z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->nodes);
polnid = zone_to_nid(z->zone);
break;
default:
BUG();
}
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
polnid = thisnid;
if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
goto out;
}
if (curnid != polnid)
ret = polnid;
out:
mpol_cond_put(pol);
return ret;
}
/*
* Drop the (possibly final) reference to task->mempolicy. It needs to be
* dropped after task->mempolicy is set to NULL so that any allocation done as
* part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
* policy.
*/
void mpol_put_task_policy(struct task_struct *task)
{
struct mempolicy *pol;
task_lock(task);
pol = task->mempolicy;
task->mempolicy = NULL;
task_unlock(task);
mpol_put(pol);
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
sp_free(n);
}
static void sp_node_init(struct sp_node *node, unsigned long start,
unsigned long end, struct mempolicy *pol)
{
node->start = start;
node->end = end;
node->policy = pol;
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
struct sp_node *n;
struct mempolicy *newpol;
n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n)
return NULL;
newpol = mpol_dup(pol);
if (IS_ERR(newpol)) {
kmem_cache_free(sn_cache, n);
return NULL;
}
newpol->flags |= MPOL_F_SHARED;
sp_node_init(n, start, end, newpol);
return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
struct sp_node *n;
struct sp_node *n_new = NULL;
struct mempolicy *mpol_new = NULL;
int ret = 0;
restart:
write_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
struct rb_node *next = rb_next(&n->nd);
if (n->start >= start) {
if (n->end <= end)
sp_delete(sp, n);
else
n->start = end;
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
if (!n_new)
goto alloc_new;
*mpol_new = *n->policy;
atomic_set(&mpol_new->refcnt, 1);
sp_node_init(n_new, end, n->end, mpol_new);
n->end = start;
sp_insert(sp, n_new);
n_new = NULL;
mpol_new = NULL;
break;
} else
n->end = start;
}
if (!next)
break;
n = rb_entry(next, struct sp_node, nd);
}
if (new)
sp_insert(sp, new);
write_unlock(&sp->lock);
ret = 0;
err_out:
if (mpol_new)
mpol_put(mpol_new);
if (n_new)
kmem_cache_free(sn_cache, n_new);
return ret;
alloc_new:
write_unlock(&sp->lock);
ret = -ENOMEM;
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n_new)
goto err_out;
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
atomic_set(&mpol_new->refcnt, 1);
goto restart;
}
/**
* mpol_shared_policy_init - initialize shared policy for inode
* @sp: pointer to inode shared policy
* @mpol: struct mempolicy to install
*
* Install non-NULL @mpol in inode's shared policy rb-tree.
* On entry, the current task has a reference on a non-NULL @mpol.
* This must be released on exit.
* This is called during get_inode() calls, so we can use GFP_KERNEL.
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
rwlock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
struct mempolicy *new;
NODEMASK_SCRATCH(scratch);
if (!scratch)
goto put_mpol;
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
if (IS_ERR(new))
goto free_scratch; /* no valid nodemask intersection */
task_lock(current);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
task_unlock(current);
if (ret)
goto put_new;
/* Create pseudo-vma that contains just the policy */
vma_init(&pvma, NULL);
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
put_new:
mpol_put(new); /* drop initial ref */
free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
mpol_put(mpol); /* drop our incoming ref on sb mpol */
}
}
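/*
 * Install @npol for the file range covered by @vma in shared policy @info.
 * A NULL @npol clears any existing policy over that range. The policy is
 * duplicated by sp_alloc(), so the caller keeps its own reference to @npol.
 */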
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma, struct mempolicy *npol)
{
int err;
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
vma->vm_pgoff,
sz, npol ? npol->mode : -1,
npol ? npol->flags : -1,
npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
if (!new)
return -ENOMEM;
}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff + sz, new);
if (err && new)
sp_free(new);
return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
struct sp_node *n;
struct rb_node *next;
if (!p->root.rb_node)
return;
write_lock(&p->lock);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
sp_delete(p, n);
}
write_unlock(&p->lock);
}
#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
bool numabalancing_default = false;
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
if (numabalancing_override)
set_numabalancing_state(numabalancing_override == 1);
if (num_online_nodes() > 1 && !numabalancing_override) {
pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
}
static int __init setup_numabalancing(char *str)
{
int ret = 0;
if (!str)
goto out;
if (!strcmp(str, "enable")) {
numabalancing_override = 1;
ret = 1;
} else if (!strcmp(str, "disable")) {
numabalancing_override = -1;
ret = 1;
}
out:
if (!ret)
pr_warn("Unable to parse numa_balancing=\n");
return ret;
}
__setup("numa_balancing=", setup_numabalancing);
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
nodemask_t interleave_nodes;
unsigned long largest = 0;
int nid, prefer = 0;
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
0, SLAB_PANIC, NULL);
sn_cache = kmem_cache_create("shared_policy_node",
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
for_each_node(nid) {
preferred_node_policy[nid] = (struct mempolicy) {
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
.nodes = nodemask_of_node(nid),
};
}
	/*
	 * Set interleaving policy for system init. Interleaving is only
	 * enabled across suitably sized nodes (by default those with
	 * >= 16MB of memory); if all nodes are smaller, fall back to
	 * the largest node.
	 */
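	/*
	 * Worked example: with 4 KiB pages (PAGE_SHIFT == 12), the 16 MB
	 * threshold below corresponds to 4096 present pages per node.
	 */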
nodes_clear(interleave_nodes);
for_each_node_state(nid, N_MEMORY) {
unsigned long total_pages = node_present_pages(nid);
/* Preserve the largest node */
if (largest < total_pages) {
largest = total_pages;
prefer = nid;
}
/* Interleave this node? */
if ((total_pages << PAGE_SHIFT) >= (16 << 20))
node_set(nid, interleave_nodes);
}
/* All too small, use the largest */
if (unlikely(nodes_empty(interleave_nodes)))
node_set(prefer, interleave_nodes);
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
pr_err("%s: interleaving failed\n", __func__);
check_numabalancing_enable();
}
/* Reset policy of current process to default */
void numa_default_policy(void)
{
do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
/*
* Parse and format mempolicy from/to strings
*/
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_LOCAL] = "local",
[MPOL_PREFERRED_MANY] = "prefer (many)",
};
#ifdef CONFIG_TMPFS
/**
* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
* @str: string containing mempolicy to parse
* @mpol: pointer to struct mempolicy pointer, returned on success.
*
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
 * Return: 0 on success, 1 on failure.
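 *
 * Illustrative inputs: "interleave:0-3", "bind=static:0,2", "prefer:1",
 * "local" (whether a nodelist is accepted also depends on which nodes
 * have memory on the running system).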
*/
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
struct mempolicy *new = NULL;
unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
int err = 1, mode;
if (flags)
*flags++ = '\0'; /* terminate mode string */
if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
if (nodelist_parse(nodelist, nodes))
goto out;
if (!nodes_subset(nodes, node_states[N_MEMORY]))
goto out;
} else
nodes_clear(nodes);
mode = match_string(policy_modes, MPOL_MAX, str);
if (mode < 0)
goto out;
switch (mode) {
case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist containing exactly one node: we
		 * later use first_node(nodes) to grab that single node,
		 * so the nodelist (and hence nodes) must not be empty.
		 */
if (nodelist) {
char *rest = nodelist;
while (isdigit(*rest))
rest++;
if (*rest)
goto out;
if (nodes_empty(nodes))
goto out;
}
break;
case MPOL_INTERLEAVE:
/*
* Default to online nodes with memory if no nodelist
*/
if (!nodelist)
nodes = node_states[N_MEMORY];
break;
case MPOL_LOCAL:
/*
* Don't allow a nodelist; mpol_new() checks flags
*/
if (nodelist)
goto out;
break;
case MPOL_DEFAULT:
/*
		 * Insist on an empty nodelist
*/
if (!nodelist)
err = 0;
goto out;
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
/*
* Insist on a nodelist
*/
if (!nodelist)
goto out;
}
mode_flags = 0;
if (flags) {
/*
* Currently, we only support two mutually exclusive
* mode flags.
*/
if (!strcmp(flags, "static"))
mode_flags |= MPOL_F_STATIC_NODES;
else if (!strcmp(flags, "relative"))
mode_flags |= MPOL_F_RELATIVE_NODES;
else
goto out;
}
new = mpol_new(mode, mode_flags, &nodes);
if (IS_ERR(new))
goto out;
/*
* Save nodes for mpol_to_str() to show the tmpfs mount options
* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
*/
if (mode != MPOL_PREFERRED) {
new->nodes = nodes;
} else if (nodelist) {
nodes_clear(new->nodes);
node_set(first_node(nodes), new->nodes);
} else {
new->mode = MPOL_LOCAL;
}
/*
* Save nodes for contextualization: this will be used to "clone"
* the mempolicy in a specific context [cpuset] at a later time.
*/
new->w.user_nodemask = nodes;
err = 0;
out:
/* Restore string for error message */
if (nodelist)
*--nodelist = ':';
if (flags)
*--flags = '=';
if (!err)
*mpol = new;
return err;
}
#endif /* CONFIG_TMPFS */
/**
* mpol_to_str - format a mempolicy structure for printing
* @buffer: to contain formatted mempolicy string
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
*
* Convert @pol into a string. If @buffer is too short, truncate the string.
 * A @maxlen of at least 32 is recommended: enough for the longest mode
 * ("interleave"), the longest flag ("relative") and a few node ids.
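 *
 * Illustrative outputs: "default", "bind:0-3", "interleave=static:0,2",
 * "prefer:2".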
*/
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
nodemask_t nodes = NODE_MASK_NONE;
unsigned short mode = MPOL_DEFAULT;
unsigned short flags = 0;
if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
mode = pol->mode;
flags = pol->flags;
}
switch (mode) {
case MPOL_DEFAULT:
case MPOL_LOCAL:
break;
case MPOL_PREFERRED:
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
nodes = pol->nodes;
break;
default:
WARN_ON_ONCE(1);
snprintf(p, maxlen, "unknown");
return;
}
p += snprintf(p, maxlen, "%s", policy_modes[mode]);
if (flags & MPOL_MODE_FLAGS) {
p += snprintf(p, buffer + maxlen - p, "=");
/*
* Currently, the only defined flags are mutually exclusive
*/
if (flags & MPOL_F_STATIC_NODES)
p += snprintf(p, buffer + maxlen - p, "static");
else if (flags & MPOL_F_RELATIVE_NODES)
p += snprintf(p, buffer + maxlen - p, "relative");
}
if (!nodes_empty(nodes))
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
bool numa_demotion_enabled = false;
#ifdef CONFIG_SYSFS
static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%s\n",
			  numa_demotion_enabled ? "true" : "false");
}
static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
numa_demotion_enabled = true;
else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
numa_demotion_enabled = false;
else
return -EINVAL;
return count;
}
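/*
 * Exposed as /sys/kernel/mm/numa/demotion_enabled (the "numa" kobject is
 * created under mm_kobj in numa_init_sysfs() below), e.g.:
 *	echo true >/sys/kernel/mm/numa/demotion_enabled
 */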
static struct kobj_attribute numa_demotion_enabled_attr =
__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
numa_demotion_enabled_store);
static struct attribute *numa_attrs[] = {
&numa_demotion_enabled_attr.attr,
NULL,
};
static const struct attribute_group numa_attr_group = {
.attrs = numa_attrs,
};
static int __init numa_init_sysfs(void)
{
int err;
struct kobject *numa_kobj;
numa_kobj = kobject_create_and_add("numa", mm_kobj);
if (!numa_kobj) {
pr_err("failed to create numa kobject\n");
return -ENOMEM;
}
err = sysfs_create_group(numa_kobj, &numa_attr_group);
if (err) {
pr_err("failed to register numa group\n");
goto delete_obj;
}
return 0;
delete_obj:
kobject_put(numa_kobj);
return err;
}
subsys_initcall(numa_init_sysfs);
#endif