Files
kernel_arpi/kernel/bpf/syscall.c
Greg Kroah-Hartman 335a32d033 Merge 5.15.75 into android14-5.15
Changes in 5.15.75
	Revert "fs: check FMODE_LSEEK to control internal pipe splicing"
	ALSA: oss: Fix potential deadlock at unregistration
	ALSA: rawmidi: Drop register_mutex in snd_rawmidi_free()
	ALSA: usb-audio: Fix potential memory leaks
	ALSA: usb-audio: Fix NULL dererence at error path
	ALSA: hda/realtek: remove ALC289_FIXUP_DUAL_SPK for Dell 5530
	ALSA: hda/realtek: Correct pin configs for ASUS G533Z
	ALSA: hda/realtek: Add quirk for ASUS GV601R laptop
	ALSA: hda/realtek: Add Intel Reference SSID to support headset keys
	mtd: rawnand: atmel: Unmap streaming DMA mappings
	io_uring/net: don't update msg_name if not provided
	hv_netvsc: Fix race between VF offering and VF association message from host
	cifs: destage dirty pages before re-reading them for cache=none
	cifs: Fix the error length of VALIDATE_NEGOTIATE_INFO message
	iio: dac: ad5593r: Fix i2c read protocol requirements
	iio: ltc2497: Fix reading conversion results
	iio: adc: ad7923: fix channel readings for some variants
	iio: pressure: dps310: Refactor startup procedure
	iio: pressure: dps310: Reset chip after timeout
	xhci: dbc: Fix memory leak in xhci_alloc_dbc()
	usb: add quirks for Lenovo OneLink+ Dock
	can: kvaser_usb: Fix use of uninitialized completion
	can: kvaser_usb_leaf: Fix overread with an invalid command
	can: kvaser_usb_leaf: Fix TX queue out of sync after restart
	can: kvaser_usb_leaf: Fix CAN state after restart
	mmc: sdhci-sprd: Fix minimum clock limit
	i2c: designware: Fix handling of real but unexpected device interrupts
	fs: dlm: fix race between test_bit() and queue_work()
	fs: dlm: handle -EBUSY first in lock arg validation
	HID: multitouch: Add memory barriers
	quota: Check next/prev free block number after reading from quota file
	platform/chrome: cros_ec_proto: Update version on GET_NEXT_EVENT failure
	ASoC: wcd9335: fix order of Slimbus unprepare/disable
	ASoC: wcd934x: fix order of Slimbus unprepare/disable
	hwmon: (gsc-hwmon) Call of_node_get() before of_find_xxx API
	net: thunderbolt: Enable DMA paths only after rings are enabled
	regulator: qcom_rpm: Fix circular deferral regression
	arm64: topology: move store_cpu_topology() to shared code
	riscv: topology: fix default topology reporting
	RISC-V: Make port I/O string accessors actually work
	parisc: fbdev/stifb: Align graphics memory size to 4MB
	riscv: Allow PROT_WRITE-only mmap()
	riscv: Make VM_WRITE imply VM_READ
	riscv: always honor the CONFIG_CMDLINE_FORCE when parsing dtb
	riscv: Pass -mno-relax only on lld < 15.0.0
	UM: cpuinfo: Fix a warning for CONFIG_CPUMASK_OFFSTACK
	nvmem: core: Fix memleak in nvmem_register()
	nvme-multipath: fix possible hang in live ns resize with ANA access
	nvme-pci: set min_align_mask before calculating max_hw_sectors
	Revert "drm/amdgpu: use dirty framebuffer helper"
	dmaengine: mxs: use platform_driver_register
	drm/virtio: Check whether transferred 2D BO is shmem
	drm/virtio: Unlock reservations on virtio_gpu_object_shmem_init() error
	drm/virtio: Use appropriate atomic state in virtio_gpu_plane_cleanup_fb()
	drm/udl: Restore display mode on resume
	arm64: errata: Add Cortex-A55 to the repeat tlbi list
	mm/damon: validate if the pmd entry is present before accessing
	mm/mmap: undo ->mmap() when arch_validate_flags() fails
	xen/gntdev: Prevent leaking grants
	xen/gntdev: Accommodate VMA splitting
	PCI: Sanitise firmware BAR assignments behind a PCI-PCI bridge
	serial: 8250: Let drivers request full 16550A feature probing
	serial: 8250: Request full 16550A feature probing for OxSemi PCIe devices
	NFSD: Protect against send buffer overflow in NFSv3 READDIR
	NFSD: Protect against send buffer overflow in NFSv2 READ
	NFSD: Protect against send buffer overflow in NFSv3 READ
	powercap: intel_rapl: Use standard Energy Unit for SPR Dram RAPL domain
	powerpc/boot: Explicitly disable usage of SPE instructions
	slimbus: qcom-ngd: use correct error in message of pdr_add_lookup() failure
	slimbus: qcom-ngd: cleanup in probe error path
	scsi: qedf: Populate sysfs attributes for vport
	gpio: rockchip: request GPIO mux to pinctrl when setting direction
	pinctrl: rockchip: add pinmux_ops.gpio_set_direction callback
	fbdev: smscufx: Fix use-after-free in ufx_ops_open()
	ksmbd: fix endless loop when encryption for response fails
	ksmbd: Fix wrong return value and message length check in smb2_ioctl()
	ksmbd: Fix user namespace mapping
	fs: record I_DIRTY_TIME even if inode already has I_DIRTY_INODE
	btrfs: fix race between quota enable and quota rescan ioctl
	btrfs: set generation before calling btrfs_clean_tree_block in btrfs_init_new_buffer
	f2fs: complete checkpoints during remount
	f2fs: flush pending checkpoints when freezing super
	f2fs: increase the limit for reserve_root
	f2fs: fix to do sanity check on destination blkaddr during recovery
	f2fs: fix to do sanity check on summary info
	hardening: Avoid harmless Clang option under CONFIG_INIT_STACK_ALL_ZERO
	hardening: Remove Clang's enable flag for -ftrivial-auto-var-init=zero
	jbd2: wake up journal waiters in FIFO order, not LIFO
	jbd2: fix potential buffer head reference count leak
	jbd2: fix potential use-after-free in jbd2_fc_wait_bufs
	jbd2: add miss release buffer head in fc_do_one_pass()
	ext4: avoid crash when inline data creation follows DIO write
	ext4: fix null-ptr-deref in ext4_write_info
	ext4: make ext4_lazyinit_thread freezable
	ext4: fix check for block being out of directory size
	ext4: don't increase iversion counter for ea_inodes
	ext4: ext4_read_bh_lock() should submit IO if the buffer isn't uptodate
	ext4: place buffer head allocation before handle start
	ext4: fix dir corruption when ext4_dx_add_entry() fails
	ext4: fix miss release buffer head in ext4_fc_write_inode
	ext4: fix potential memory leak in ext4_fc_record_modified_inode()
	ext4: fix potential memory leak in ext4_fc_record_regions()
	ext4: update 'state->fc_regions_size' after successful memory allocation
	livepatch: fix race between fork and KLP transition
	ftrace: Properly unset FTRACE_HASH_FL_MOD
	ring-buffer: Allow splice to read previous partially read pages
	ring-buffer: Have the shortest_full queue be the shortest not longest
	ring-buffer: Check pending waiters when doing wake ups as well
	ring-buffer: Add ring_buffer_wake_waiters()
	ring-buffer: Fix race between reset page and reading page
	tracing: Disable interrupt or preemption before acquiring arch_spinlock_t
	tracing: Wake up ring buffer waiters on closing of the file
	tracing: Wake up waiters when tracing is disabled
	tracing: Add ioctl() to force ring buffer waiters to wake up
	tracing: Move duplicate code of trace_kprobe/eprobe.c into header
	tracing: Add "(fault)" name injection to kernel probes
	tracing: Fix reading strings from synthetic events
	thunderbolt: Explicitly enable lane adapter hotplug events at startup
	efi: libstub: drop pointless get_memory_map() call
	media: cedrus: Set the platform driver data earlier
	media: cedrus: Fix endless loop in cedrus_h265_skip_bits()
	blk-wbt: call rq_qos_add() after wb_normal is initialized
	KVM: x86/emulator: Fix handing of POP SS to correctly set interruptibility
	KVM: nVMX: Unconditionally purge queued/injected events on nested "exit"
	KVM: nVMX: Don't propagate vmcs12's PERF_GLOBAL_CTRL settings to vmcs02
	KVM: VMX: Drop bits 31:16 when shoving exception error code into VMCS
	staging: greybus: audio_helper: remove unused and wrong debugfs usage
	drm/nouveau/kms/nv140-: Disable interlacing
	drm/nouveau: fix a use-after-free in nouveau_gem_prime_import_sg_table()
	drm/i915: Fix watermark calculations for gen12+ RC CCS modifier
	drm/i915: Fix watermark calculations for gen12+ MC CCS modifier
	drm/i915: Fix watermark calculations for gen12+ CCS+CC modifier
	drm/amd/display: Fix vblank refcount in vrr transition
	smb3: must initialize two ACL struct fields to zero
	selinux: use "grep -E" instead of "egrep"
	ima: fix blocking of security.ima xattrs of unsupported algorithms
	userfaultfd: open userfaultfds with O_RDONLY
	ntfs3: rework xattr handlers and switch to POSIX ACL VFS helpers
	thermal: cpufreq_cooling: Check the policy first in cpufreq_cooling_register()
	sh: machvec: Use char[] for section boundaries
	MIPS: SGI-IP27: Free some unused memory
	MIPS: SGI-IP27: Fix platform-device leak in bridge_platform_create()
	ARM: 9244/1: dump: Fix wrong pg_level in walk_pmd()
	ARM: 9247/1: mm: set readonly for MT_MEMORY_RO with ARM_LPAE
	objtool: Preserve special st_shndx indexes in elf_update_symbol
	nfsd: Fix a memory leak in an error handling path
	SUNRPC: Fix svcxdr_init_decode's end-of-buffer calculation
	SUNRPC: Fix svcxdr_init_encode's buflen calculation
	NFSD: Protect against send buffer overflow in NFSv2 READDIR
	NFSD: Fix handling of oversized NFSv4 COMPOUND requests
	wifi: rtlwifi: 8192de: correct checking of IQK reload
	wifi: ath10k: add peer map clean up for peer delete in ath10k_sta_state()
	leds: lm3601x: Don't use mutex after it was destroyed
	bpf: Fix reference state management for synchronous callbacks
	wifi: mac80211: allow bw change during channel switch in mesh
	bpftool: Fix a wrong type cast in btf_dumper_int
	spi: mt7621: Fix an error message in mt7621_spi_probe()
	x86/resctrl: Fix to restore to original value when re-enabling hardware prefetch register
	xsk: Fix backpressure mechanism on Tx
	bpf: Disable preemption when increasing per-cpu map_locked
	bpf: Propagate error from htab_lock_bucket() to userspace
	bpf: Use this_cpu_{inc|dec|inc_return} for bpf_task_storage_busy
	Bluetooth: btusb: mediatek: fix WMT failure during runtime suspend
	wifi: rtl8xxxu: tighten bounds checking in rtl8xxxu_read_efuse()
	wifi: rtw88: add missing destroy_workqueue() on error path in rtw_core_init()
	selftests/xsk: Avoid use-after-free on ctx
	spi: qup: add missing clk_disable_unprepare on error in spi_qup_resume()
	spi: qup: add missing clk_disable_unprepare on error in spi_qup_pm_resume_runtime()
	wifi: rtl8xxxu: Fix skb misuse in TX queue selection
	spi: meson-spicc: do not rely on busy flag in pow2 clk ops
	bpf: btf: fix truncated last_member_type_id in btf_struct_resolve
	wifi: rtl8xxxu: gen2: Fix mistake in path B IQ calibration
	wifi: rtl8xxxu: Remove copy-paste leftover in gen2_update_rate_mask
	wifi: mt76: sdio: fix transmitting packet hangs
	wifi: mt76: mt7615: add mt7615_mutex_acquire/release in mt7615_sta_set_decap_offload
	wifi: mt76: mt7915: do not check state before configuring implicit beamform
	Bluetooth: RFCOMM: Fix possible deadlock on socket shutdown/release
	net: fs_enet: Fix wrong check in do_pd_setup
	bpf: Ensure correct locking around vulnerable function find_vpid()
	Bluetooth: hci_{ldisc,serdev}: check percpu_init_rwsem() failure
	netfilter: conntrack: fix the gc rescheduling delay
	netfilter: conntrack: revisit the gc initial rescheduling bias
	wifi: ath11k: fix number of VHT beamformee spatial streams
	x86/microcode/AMD: Track patch allocation size explicitly
	x86/cpu: Include the header of init_ia32_feat_ctl()'s prototype
	spi: dw: Fix PM disable depth imbalance in dw_spi_bt1_probe
	spi/omap100k:Fix PM disable depth imbalance in omap1_spi100k_probe
	skmsg: Schedule psock work if the cached skb exists on the psock
	i2c: mlxbf: support lock mechanism
	Bluetooth: hci_core: Fix not handling link timeouts propertly
	xfrm: Reinject transport-mode packets through workqueue
	netfilter: nft_fib: Fix for rpath check with VRF devices
	spi: s3c64xx: Fix large transfers with DMA
	wifi: rtl8xxxu: Fix AIFS written to REG_EDCA_*_PARAM
	vhost/vsock: Use kvmalloc/kvfree for larger packets.
	eth: alx: take rtnl_lock on resume
	mISDN: fix use-after-free bugs in l1oip timer handlers
	sctp: handle the error returned from sctp_auth_asoc_init_active_key
	tcp: fix tcp_cwnd_validate() to not forget is_cwnd_limited
	spi: Ensure that sg_table won't be used after being freed
	hwmon: (pmbus/mp2888) Fix sensors readouts for MPS Multi-phase mp2888 controller
	net: rds: don't hold sock lock when cancelling work from rds_tcp_reset_callbacks()
	bnx2x: fix potential memory leak in bnx2x_tpa_stop()
	net: wwan: iosm: Call mutex_init before locking it
	net/ieee802154: reject zero-sized raw_sendmsg()
	once: add DO_ONCE_SLOW() for sleepable contexts
	net: mvpp2: fix mvpp2 debugfs leak
	drm: bridge: adv7511: fix CEC power down control register offset
	drm: bridge: adv7511: unregister cec i2c device after cec adapter
	drm/bridge: Avoid uninitialized variable warning
	drm/mipi-dsi: Detach devices when removing the host
	drm/virtio: Correct drm_gem_shmem_get_sg_table() error handling
	drm/bridge: parade-ps8640: Fix regulator supply order
	drm/dp_mst: fix drm_dp_dpcd_read return value checks
	drm:pl111: Add of_node_put() when breaking out of for_each_available_child_of_node()
	ASoC: mt6359: fix tests for platform_get_irq() failure
	platform/chrome: fix double-free in chromeos_laptop_prepare()
	platform/chrome: fix memory corruption in ioctl
	ASoC: tas2764: Allow mono streams
	ASoC: tas2764: Drop conflicting set_bias_level power setting
	ASoC: tas2764: Fix mute/unmute
	platform/x86: msi-laptop: Fix old-ec check for backlight registering
	platform/x86: msi-laptop: Fix resource cleanup
	platform/chrome: cros_ec_typec: Correct alt mode index
	drm/amdgpu: add missing pci_disable_device() in amdgpu_pmops_runtime_resume()
	drm/bridge: megachips: Fix a null pointer dereference bug
	ASoC: rsnd: Add check for rsnd_mod_power_on
	ALSA: hda: beep: Simplify keep-power-at-enable behavior
	drm/bochs: fix blanking
	drm/omap: dss: Fix refcount leak bugs
	drm/amdgpu: Fix memory leak in hpd_rx_irq_create_workqueue()
	mmc: au1xmmc: Fix an error handling path in au1xmmc_probe()
	ASoC: eureka-tlv320: Hold reference returned from of_find_xxx API
	drm/msm/dpu: index dpu_kms->hw_vbif using vbif_idx
	drm/msm/dp: correct 1.62G link rate at dp_catalog_ctrl_config_msa()
	drm/vmwgfx: Fix memory leak in vmw_mksstat_add_ioctl()
	ASoC: codecs: tx-macro: fix kcontrol put
	ASoC: da7219: Fix an error handling path in da7219_register_dai_clks()
	ALSA: dmaengine: increment buffer pointer atomically
	mmc: wmt-sdmmc: Fix an error handling path in wmt_mci_probe()
	ASoC: wm8997: Fix PM disable depth imbalance in wm8997_probe
	ASoC: wm5110: Fix PM disable depth imbalance in wm5110_probe
	ASoC: wm5102: Fix PM disable depth imbalance in wm5102_probe
	ASoC: mt6660: Fix PM disable depth imbalance in mt6660_i2c_probe
	ALSA: hda/hdmi: Don't skip notification handling during PM operation
	memory: pl353-smc: Fix refcount leak bug in pl353_smc_probe()
	memory: of: Fix refcount leak bug in of_get_ddr_timings()
	memory: of: Fix refcount leak bug in of_lpddr3_get_ddr_timings()
	locks: fix TOCTOU race when granting write lease
	soc: qcom: smsm: Fix refcount leak bugs in qcom_smsm_probe()
	soc: qcom: smem_state: Add refcounting for the 'state->of_node'
	ARM: dts: imx6qdl-kontron-samx6i: hook up DDC i2c bus
	ARM: dts: turris-omnia: Fix mpp26 pin name and comment
	ARM: dts: kirkwood: lsxl: fix serial line
	ARM: dts: kirkwood: lsxl: remove first ethernet port
	ia64: export memory_add_physaddr_to_nid to fix cxl build error
	soc/tegra: fuse: Drop Kconfig dependency on TEGRA20_APB_DMA
	arm64: dts: ti: k3-j7200: fix main pinmux range
	ARM: dts: exynos: correct s5k6a3 reset polarity on Midas family
	ARM: Drop CMDLINE_* dependency on ATAGS
	ext4: don't run ext4lazyinit for read-only filesystems
	arm64: ftrace: fix module PLTs with mcount
	ARM: dts: exynos: fix polarity of VBUS GPIO of Origen
	iio: adc: at91-sama5d2_adc: fix AT91_SAMA5D2_MR_TRACKTIM_MAX
	iio: adc: at91-sama5d2_adc: check return status for pressure and touch
	iio: adc: at91-sama5d2_adc: lock around oversampling and sample freq
	iio: adc: at91-sama5d2_adc: disable/prepare buffer on suspend/resume
	iio: inkern: only release the device node when done with it
	iio: inkern: fix return value in devm_of_iio_channel_get_by_name()
	iio: ABI: Fix wrong format of differential capacitance channel ABI.
	iio: magnetometer: yas530: Change data type of hard_offsets to signed
	RDMA/mlx5: Don't compare mkey tags in DEVX indirect mkey
	usb: common: debug: Check non-standard control requests
	clk: meson: Hold reference returned by of_get_parent()
	clk: oxnas: Hold reference returned by of_get_parent()
	clk: qoriq: Hold reference returned by of_get_parent()
	clk: berlin: Add of_node_put() for of_get_parent()
	clk: sprd: Hold reference returned by of_get_parent()
	clk: tegra: Fix refcount leak in tegra210_clock_init
	clk: tegra: Fix refcount leak in tegra114_clock_init
	clk: tegra20: Fix refcount leak in tegra20_clock_init
	HSI: omap_ssi: Fix refcount leak in ssi_probe
	HSI: omap_ssi_port: Fix dma_map_sg error check
	media: exynos4-is: fimc-is: Add of_node_put() when breaking out of loop
	tty: xilinx_uartps: Fix the ignore_status
	media: meson: vdec: add missing clk_disable_unprepare on error in vdec_hevc_start()
	media: uvcvideo: Fix memory leak in uvc_gpio_parse
	media: uvcvideo: Use entity get_cur in uvc_ctrl_set
	media: xilinx: vipp: Fix refcount leak in xvip_graph_dma_init
	RDMA/rxe: Fix "kernel NULL pointer dereference" error
	RDMA/rxe: Fix the error caused by qp->sk
	misc: ocxl: fix possible refcount leak in afu_ioctl()
	fpga: prevent integer overflow in dfl_feature_ioctl_set_irq()
	dmaengine: hisilicon: Disable channels when unregister hisi_dma
	dmaengine: hisilicon: Fix CQ head update
	dmaengine: hisilicon: Add multi-thread support for a DMA channel
	dyndbg: fix static_branch manipulation
	dyndbg: fix module.dyndbg handling
	dyndbg: let query-modname override actual module name
	dyndbg: drop EXPORTed dynamic_debug_exec_queries
	clk: qcom: sm6115: Select QCOM_GDSC
	mtd: devices: docg3: check the return value of devm_ioremap() in the probe
	phy: amlogic: phy-meson-axg-mipi-pcie-analog: Hold reference returned by of_get_parent()
	phy: phy-mtk-tphy: fix the phy type setting issue
	mtd: rawnand: intel: Read the chip-select line from the correct OF node
	mtd: rawnand: intel: Remove undocumented compatible string
	mtd: rawnand: fsl_elbc: Fix none ECC mode
	RDMA/irdma: Align AE id codes to correct flush code and event
	RDMA/srp: Fix srp_abort()
	RDMA/siw: Always consume all skbuf data in sk_data_ready() upcall.
	RDMA/siw: Fix QP destroy to wait for all references dropped.
	ata: fix ata_id_sense_reporting_enabled() and ata_id_has_sense_reporting()
	ata: fix ata_id_has_devslp()
	ata: fix ata_id_has_ncq_autosense()
	ata: fix ata_id_has_dipm()
	mtd: rawnand: meson: fix bit map use in meson_nfc_ecc_correct()
	md: Replace snprintf with scnprintf
	md/raid5: Ensure stripe_fill happens on non-read IO with journal
	md/raid5: Remove unnecessary bio_put() in raid5_read_one_chunk()
	RDMA/cm: Use SLID in the work completion as the DLID in responder side
	IB: Set IOVA/LENGTH on IB_MR in core/uverbs layers
	xhci: Don't show warning for reinit on known broken suspend
	usb: gadget: function: fix dangling pnp_string in f_printer.c
	drivers: serial: jsm: fix some leaks in probe
	serial: 8250: Toggle IER bits on only after irq has been set up
	tty: serial: fsl_lpuart: disable dma rx/tx use flags in lpuart_dma_shutdown
	phy: qualcomm: call clk_disable_unprepare in the error handling
	staging: vt6655: fix some erroneous memory clean-up loops
	slimbus: qcom-ngd-ctrl: allow compile testing without QCOM_RPROC_COMMON
	firmware: google: Test spinlock on panic path to avoid lockups
	serial: 8250: Fix restoring termios speed after suspend
	scsi: libsas: Fix use-after-free bug in smp_execute_task_sg()
	scsi: iscsi: Rename iscsi_conn_queue_work()
	scsi: iscsi: Add recv workqueue helpers
	scsi: iscsi: Run recv path from workqueue
	scsi: iscsi: iscsi_tcp: Fix null-ptr-deref while calling getpeername()
	clk: qcom: apss-ipq6018: mark apcs_alias0_core_clk as critical
	clk: qcom: gcc-sm6115: Override default Alpha PLL regs
	RDMA/rxe: Fix resize_finish() in rxe_queue.c
	fsi: core: Check error number after calling ida_simple_get
	mfd: intel_soc_pmic: Fix an error handling path in intel_soc_pmic_i2c_probe()
	mfd: fsl-imx25: Fix an error handling path in mx25_tsadc_setup_irq()
	mfd: lp8788: Fix an error handling path in lp8788_probe()
	mfd: lp8788: Fix an error handling path in lp8788_irq_init() and lp8788_irq_init()
	mfd: fsl-imx25: Fix check for platform_get_irq() errors
	mfd: sm501: Add check for platform_driver_register()
	clk: mediatek: mt8183: mfgcfg: Propagate rate changes to parent
	dmaengine: ioat: stop mod_timer from resurrecting deleted timer in __cleanup()
	usb: mtu3: fix failed runtime suspend in host only mode
	spmi: pmic-arb: correct duplicate APID to PPID mapping logic
	clk: vc5: Fix 5P49V6901 outputs disabling when enabling FOD
	clk: baikal-t1: Fix invalid xGMAC PTP clock divider
	clk: baikal-t1: Add shared xGMAC ref/ptp clocks internal parent
	clk: baikal-t1: Add SATA internal ref clock buffer
	clk: bcm2835: fix bcm2835_clock_rate_from_divisor declaration
	clk: imx: scu: fix memleak on platform_device_add() fails
	clk: ti: dra7-atl: Fix reference leak in of_dra7_atl_clk_probe
	clk: ast2600: BCLK comes from EPLL
	mailbox: mpfs: fix handling of the reg property
	mailbox: mpfs: account for mbox offsets while sending
	mailbox: bcm-ferxrm-mailbox: Fix error check for dma_map_sg
	powerpc/configs: Properly enable PAPR_SCM in pseries_defconfig
	powerpc/math_emu/efp: Include module.h
	powerpc/sysdev/fsl_msi: Add missing of_node_put()
	powerpc/pci_dn: Add missing of_node_put()
	powerpc/powernv: add missing of_node_put() in opal_export_attrs()
	powerpc: Fix fallocate and fadvise64_64 compat parameter combination
	x86/hyperv: Fix 'struct hv_enlightened_vmcs' definition
	powerpc/64s: Fix GENERIC_CPU build flags for PPC970 / G5
	powerpc: Fix SPE Power ISA properties for e500v1 platforms
	powerpc/kprobes: Fix null pointer reference in arch_prepare_kprobe()
	powerpc/pseries/vas: Pass hw_cpu_id to node associativity HCALL
	crypto: sahara - don't sleep when in softirq
	crypto: hisilicon/zip - fix mismatch in get/set sgl_sge_nr
	hwrng: arm-smccc-trng - fix NO_ENTROPY handling
	cgroup: Honor caller's cgroup NS when resolving path
	hwrng: imx-rngc - Moving IRQ handler registering after imx_rngc_irq_mask_clear()
	crypto: qat - fix default value of WDT timer
	crypto: hisilicon/qm - fix missing put dfx access
	cgroup/cpuset: Enable update_tasks_cpumask() on top_cpuset
	iommu/omap: Fix buffer overflow in debugfs
	crypto: akcipher - default implementation for setting a private key
	crypto: ccp - Release dma channels before dmaengine unrgister
	crypto: inside-secure - Change swab to swab32
	crypto: qat - fix DMA transfer direction
	cifs: return correct error in ->calc_signature()
	iommu/iova: Fix module config properly
	tracing: kprobe: Fix kprobe event gen test module on exit
	tracing: kprobe: Make gen test module work in arm and riscv
	tracing/osnoise: Fix possible recursive locking in stop_per_cpu_kthreads
	kbuild: remove the target in signal traps when interrupted
	kbuild: rpm-pkg: fix breakage when V=1 is used
	crypto: marvell/octeontx - prevent integer overflows
	crypto: cavium - prevent integer overflow loading firmware
	thermal/drivers/qcom/tsens-v0_1: Fix MSM8939 fourth sensor hw_id
	ACPI: APEI: do not add task_work to kernel thread to avoid memory leak
	f2fs: fix race condition on setting FI_NO_EXTENT flag
	f2fs: fix to account FS_CP_DATA_IO correctly
	selftest: tpm2: Add Client.__del__() to close /dev/tpm* handle
	fs: dlm: fix race in lowcomms
	rcu: Avoid triggering strict-GP irq-work when RCU is idle
	rcu: Back off upon fill_page_cache_func() allocation failure
	rcu-tasks: Convert RCU_LOCKDEP_WARN() to WARN_ONCE()
	ACPI: video: Add Toshiba Satellite/Portege Z830 quirk
	ACPI: tables: FPDT: Don't call acpi_os_map_memory() on invalid phys address
	cpufreq: intel_pstate: Add Tigerlake support in no-HWP mode
	MIPS: BCM47XX: Cast memcmp() of function to (void *)
	powercap: intel_rapl: fix UBSAN shift-out-of-bounds issue
	thermal: intel_powerclamp: Use get_cpu() instead of smp_processor_id() to avoid crash
	ARM: decompressor: Include .data.rel.ro.local
	ACPI: x86: Add a quirk for Dell Inspiron 14 2-in-1 for StorageD3Enable
	x86/entry: Work around Clang __bdos() bug
	NFSD: Return nfserr_serverfault if splice_ok but buf->pages have data
	NFSD: fix use-after-free on source server when doing inter-server copy
	wifi: brcmfmac: fix invalid address access when enabling SCAN log level
	bpftool: Clear errno after libcap's checks
	ice: set tx_tstamps when creating new Tx rings via ethtool
	net: ethernet: ti: davinci_mdio: Add workaround for errata i2329
	openvswitch: Fix double reporting of drops in dropwatch
	openvswitch: Fix overreporting of drops in dropwatch
	tcp: annotate data-race around tcp_md5sig_pool_populated
	x86/mce: Retrieve poison range from hardware
	wifi: ath9k: avoid uninit memory read in ath9k_htc_rx_msg()
	thunderbolt: Add back Intel Falcon Ridge end-to-end flow control workaround
	xfrm: Update ipcomp_scratches with NULL when freed
	iavf: Fix race between iavf_close and iavf_reset_task
	wifi: brcmfmac: fix use-after-free bug in brcmf_netdev_start_xmit()
	Bluetooth: btintel: Mark Intel controller to support LE_STATES quirk
	regulator: core: Prevent integer underflow
	wifi: mt76: mt7921: reset msta->airtime_ac while clearing up hw value
	Bluetooth: L2CAP: initialize delayed works at l2cap_chan_create()
	Bluetooth: hci_sysfs: Fix attempting to call device_add multiple times
	can: bcm: check the result of can_send() in bcm_can_tx()
	wifi: rt2x00: don't run Rt5592 IQ calibration on MT7620
	wifi: rt2x00: set correct TX_SW_CFG1 MAC register for MT7620
	wifi: rt2x00: set VGC gain for both chains of MT7620
	wifi: rt2x00: set SoC wmac clock register
	wifi: rt2x00: correctly set BBP register 86 for MT7620
	hwmon: (sht4x) do not overflow clamping operation on 32-bit platforms
	net: If sock is dead don't access sock's sk_wq in sk_stream_wait_memory
	Bluetooth: L2CAP: Fix user-after-free
	r8152: Rate limit overflow messages
	drm/nouveau/nouveau_bo: fix potential memory leak in nouveau_bo_alloc()
	drm: Use size_t type for len variable in drm_copy_field()
	drm: Prevent drm_copy_field() to attempt copying a NULL pointer
	drm/komeda: Fix handling of atomic commits in the atomic_commit_tail hook
	gpu: lontium-lt9611: Fix NULL pointer dereference in lt9611_connector_init()
	drm/amd/display: fix overflow on MIN_I64 definition
	udmabuf: Set ubuf->sg = NULL if the creation of sg table fails
	drm: bridge: dw_hdmi: only trigger hotplug event on link change
	ALSA: usb-audio: Register card at the last interface
	drm/vc4: vec: Fix timings for VEC modes
	drm: panel-orientation-quirks: Add quirk for Anbernic Win600
	platform/chrome: cros_ec: Notify the PM of wake events during resume
	platform/x86: msi-laptop: Change DMI match / alias strings to fix module autoloading
	ASoC: SOF: pci: Change DMI match info to support all Chrome platforms
	drm/amdgpu: fix initial connector audio value
	drm/meson: reorder driver deinit sequence to fix use-after-free bug
	drm/meson: explicitly remove aggregate driver at module unload time
	mmc: sdhci-msm: add compatible string check for sdm670
	drm/dp: Don't rewrite link config when setting phy test pattern
	drm/amd/display: Remove interface for periodic interrupt 1
	ARM: dts: imx7d-sdb: config the max pressure for tsc2046
	ARM: dts: imx6q: add missing properties for sram
	ARM: dts: imx6dl: add missing properties for sram
	ARM: dts: imx6qp: add missing properties for sram
	ARM: dts: imx6sl: add missing properties for sram
	ARM: dts: imx6sll: add missing properties for sram
	ARM: dts: imx6sx: add missing properties for sram
	kselftest/arm64: Fix validatation termination record after EXTRA_CONTEXT
	arm64: dts: imx8mq-librem5: Add bq25895 as max17055's power supply
	btrfs: dump extra info if one free space cache has more bitmaps than it should
	btrfs: scrub: try to fix super block errors
	btrfs: don't print information about space cache or tree every remount
	ARM: 9242/1: kasan: Only map modules if CONFIG_KASAN_VMALLOC=n
	clk: zynqmp: Fix stack-out-of-bounds in strncpy`
	media: cx88: Fix a null-ptr-deref bug in buffer_prepare()
	media: platform: fix some double free in meson-ge2d and mtk-jpeg and s5p-mfc
	clk: zynqmp: pll: rectify rate rounding in zynqmp_pll_round_rate
	usb: host: xhci-plat: suspend and resume clocks
	usb: host: xhci-plat: suspend/resume clks for brcm
	dmaengine: ti: k3-udma: Reset UDMA_CHAN_RT byte counters to prevent overflow
	scsi: 3w-9xxx: Avoid disabling device if failing to enable it
	nbd: Fix hung when signal interrupts nbd_start_device_ioctl()
	iommu/arm-smmu-v3: Make default domain type of HiSilicon PTT device to identity
	power: supply: adp5061: fix out-of-bounds read in adp5061_get_chg_type()
	staging: vt6655: fix potential memory leak
	blk-throttle: prevent overflow while calculating wait time
	ata: libahci_platform: Sanity check the DT child nodes number
	bcache: fix set_at_max_writeback_rate() for multiple attached devices
	soundwire: cadence: Don't overwrite msg->buf during write commands
	soundwire: intel: fix error handling on dai registration issues
	HID: roccat: Fix use-after-free in roccat_read()
	eventfd: guard wake_up in eventfd fs calls as well
	md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d
	usb: host: xhci: Fix potential memory leak in xhci_alloc_stream_info()
	usb: musb: Fix musb_gadget.c rxstate overflow bug
	arm64: dts: imx8mp: Add snps,gfladj-refclk-lpm-sel quirk to USB nodes
	usb: dwc3: core: Enable GUCTL1 bit 10 for fixing termination error after resume bug
	Revert "usb: storage: Add quirk for Samsung Fit flash"
	staging: rtl8723bs: fix potential memory leak in rtw_init_drv_sw()
	staging: rtl8723bs: fix a potential memory leak in rtw_init_cmd_priv()
	scsi: tracing: Fix compile error in trace_array calls when TRACING is disabled
	ext2: Use kvmalloc() for group descriptor array
	nvme: copy firmware_rev on each init
	nvmet-tcp: add bounds check on Transfer Tag
	usb: idmouse: fix an uninit-value in idmouse_open
	clk: bcm2835: Make peripheral PLLC critical
	clk: bcm2835: Round UART input clock up
	perf intel-pt: Fix segfault in intel_pt_print_info() with uClibc
	io_uring/af_unix: defer registered files gc to io_uring release
	io_uring: correct pinned_vm accounting
	io_uring/rw: fix short rw error handling
	io_uring/rw: fix error'ed retry return values
	io_uring/rw: fix unexpected link breakage
	mm: hugetlb: fix UAF in hugetlb_handle_userfault
	net: ieee802154: return -EINVAL for unknown addr type
	ALSA: usb-audio: Fix last interface check for registration
	blk-wbt: fix that 'rwb->wc' is always set to 1 in wbt_init()
	net: ethernet: ti: davinci_mdio: fix build for mdio bitbang uses
	Revert "net/ieee802154: reject zero-sized raw_sendmsg()"
	net/ieee802154: don't warn zero-sized raw_sendmsg()
	drm/amd/display: Fix build breakage with CONFIG_DEBUG_FS=n
	Kconfig.debug: simplify the dependency of DEBUG_INFO_DWARF4/5
	Kconfig.debug: add toolchain checks for DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT
	lib/Kconfig.debug: Add check for non-constant .{s,u}leb128 support to DWARF5
	ext4: continue to expand file system when the target size doesn't reach
	thermal: intel_powerclamp: Use first online CPU as control_cpu
	gcov: support GCC 12.1 and newer compilers
	io-wq: Fix memory leak in worker creation
	Linux 5.15.75

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: Id3dfe8285e830d01df7adb33729e78a75c1cac1a
2022-11-02 08:51:19 +01:00

4812 lines
114 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
#include <linux/audit.h>
#include <uapi/linux/btf.h>
#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <trace/hooks/syscall_check.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
IS_FD_HASH(map))
#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
DEFINE_PER_CPU(int, bpf_prog_active);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
static DEFINE_IDR(link_idr);
static DEFINE_SPINLOCK(link_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly =
IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
[_id] = &_ops,
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};
/*
* If we're handed a bigger struct than we know of, ensure all the unknown bits
* are 0 - i.e. new user-space does not rely on any kernel feature extensions
* we don't know about yet.
*
* There is a ToCToU between this function call and the following
* copy_from_user() call. However, this is not a concern since this function is
* meant to be a future-proofing of bits.
*/
int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
size_t expected_size,
size_t actual_size)
{
int res;
if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
return -E2BIG;
if (actual_size <= expected_size)
return 0;
if (uaddr.is_kernel)
res = memchr_inv(uaddr.kernel + expected_size, 0,
actual_size - expected_size) == NULL;
else
res = check_zeroed_user(uaddr.user + expected_size,
actual_size - expected_size);
if (res < 0)
return res;
return res ? 0 : -E2BIG;
}
const struct bpf_map_ops bpf_map_offload_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = bpf_map_offload_map_alloc,
.map_free = bpf_map_offload_map_free,
.map_check_btf = map_check_no_btf,
};
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
const struct bpf_map_ops *ops;
u32 type = attr->map_type;
struct bpf_map *map;
int err;
if (type >= ARRAY_SIZE(bpf_map_types))
return ERR_PTR(-EINVAL);
type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
ops = bpf_map_types[type];
if (!ops)
return ERR_PTR(-EINVAL);
if (ops->map_alloc_check) {
err = ops->map_alloc_check(attr);
if (err)
return ERR_PTR(err);
}
if (attr->map_ifindex)
ops = &bpf_map_offload_ops;
map = ops->map_alloc(attr);
if (IS_ERR(map))
return map;
map->ops = ops;
map->map_type = type;
return map;
}
static void bpf_map_write_active_inc(struct bpf_map *map)
{
atomic64_inc(&map->writecnt);
}
static void bpf_map_write_active_dec(struct bpf_map *map)
{
atomic64_dec(&map->writecnt);
}
bool bpf_map_write_active(const struct bpf_map *map)
{
return atomic64_read(&map->writecnt) != 0;
}
static u32 bpf_map_value_size(const struct bpf_map *map)
{
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
return round_up(map->value_size, 8) * num_possible_cpus();
else if (IS_FD_MAP(map))
return sizeof(u32);
else
return map->value_size;
}
static void maybe_wait_bpf_programs(struct bpf_map *map)
{
/* Wait for any running BPF programs to complete so that
* userspace, when we return to it, knows that all programs
* that could be running use the new map value.
*/
if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
synchronize_rcu();
}
static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
void *value, __u64 flags)
{
int err;
/* Need to create a kthread, thus must support schedule */
if (bpf_map_is_dev_bound(map)) {
return bpf_map_offload_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
return map->ops->map_update_elem(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
return sock_map_update_elem_sys(map, key, value, flags);
} else if (IS_FD_PROG_ARRAY(map)) {
return bpf_fd_array_map_update_elem(map, f.file, key, value,
flags);
}
bpf_disable_instrumentation();
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_update(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
err = bpf_percpu_cgroup_storage_update(map, key, value,
flags);
} else if (IS_FD_ARRAY(map)) {
rcu_read_lock();
err = bpf_fd_array_map_update_elem(map, f.file, key, value,
flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
rcu_read_lock();
err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
/* rcu_read_lock() is not needed */
err = bpf_fd_reuseport_array_update_elem(map, key, value,
flags);
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
err = map->ops->map_push_elem(map, value, flags);
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, flags);
rcu_read_unlock();
}
bpf_enable_instrumentation();
maybe_wait_bpf_programs(map);
return err;
}
static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
__u64 flags)
{
void *ptr;
int err;
if (bpf_map_is_dev_bound(map))
return bpf_map_offload_lookup_elem(map, key, value);
bpf_disable_instrumentation();
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
err = bpf_percpu_cgroup_storage_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
err = bpf_fd_array_map_lookup_elem(map, key, value);
} else if (IS_FD_HASH(map)) {
err = bpf_fd_htab_map_lookup_elem(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
err = map->ops->map_peek_elem(map, value);
} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
/* struct_ops map requires directly updating "value" */
err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
} else {
rcu_read_lock();
if (map->ops->map_lookup_elem_sys_only)
ptr = map->ops->map_lookup_elem_sys_only(map, key);
else
ptr = map->ops->map_lookup_elem(map, key);
if (IS_ERR(ptr)) {
err = PTR_ERR(ptr);
} else if (!ptr) {
err = -ENOENT;
} else {
err = 0;
if (flags & BPF_F_LOCK)
/* lock 'ptr' and copy everything but lock */
copy_map_value_locked(map, value, ptr, true);
else
copy_map_value(map, value, ptr);
/* mask lock and timer, since value wasn't zero inited */
check_and_init_map_value(map, value);
}
rcu_read_unlock();
}
bpf_enable_instrumentation();
maybe_wait_bpf_programs(map);
return err;
}
/* Please, do not use this function outside from the map creation path
* (e.g. in map update path) without taking care of setting the active
* memory cgroup (see at bpf_map_kmalloc_node() for example).
*/
static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
{
/* We really just want to fail instead of triggering OOM killer
* under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
* which is used for lower order allocation requests.
*
* It has been observed that higher order allocation requests done by
* vmalloc with __GFP_NORETRY being set might fail due to not trying
* to reclaim memory from the page cache, thus we set
* __GFP_RETRY_MAYFAIL to avoid such situations.
*/
const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
unsigned int flags = 0;
unsigned long align = 1;
void *area;
if (size >= SIZE_MAX)
return NULL;
/* kmalloc()'ed memory can't be mmap()'ed */
if (mmapable) {
BUG_ON(!PAGE_ALIGNED(size));
align = SHMLBA;
flags = VM_USERMAP;
} else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
numa_node);
if (area != NULL)
return area;
}
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
flags, numa_node, __builtin_return_address(0));
}
void *bpf_map_area_alloc(u64 size, int numa_node)
{
return __bpf_map_area_alloc(size, numa_node, false);
}
void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
{
return __bpf_map_area_alloc(size, numa_node, true);
}
void bpf_map_area_free(void *area)
{
kvfree(area);
}
static u32 bpf_map_flags_retain_permanent(u32 flags)
{
/* Some map creation flags are not tied to the map object but
* rather to the map fd instead, so they have no meaning upon
* map object inspection since multiple file descriptors with
* different (access) properties can exist here. Thus, given
* this has zero meaning for the map itself, lets clear these
* from here.
*/
return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
}
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
{
map->map_type = attr->map_type;
map->key_size = attr->key_size;
map->value_size = attr->value_size;
map->max_entries = attr->max_entries;
map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
map->numa_node = bpf_map_attr_numa_node(attr);
}
static int bpf_map_alloc_id(struct bpf_map *map)
{
int id;
idr_preload(GFP_KERNEL);
spin_lock_bh(&map_idr_lock);
id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
if (id > 0)
map->id = id;
spin_unlock_bh(&map_idr_lock);
idr_preload_end();
if (WARN_ON_ONCE(!id))
return -ENOSPC;
return id > 0 ? 0 : id;
}
void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
{
unsigned long flags;
/* Offloaded maps are removed from the IDR store when their device
* disappears - even if someone holds an fd to them they are unusable,
* the memory is gone, all ops will fail; they are simply waiting for
* refcnt to drop to be freed.
*/
if (!map->id)
return;
if (do_idr_lock)
spin_lock_irqsave(&map_idr_lock, flags);
else
__acquire(&map_idr_lock);
idr_remove(&map_idr, map->id);
map->id = 0;
if (do_idr_lock)
spin_unlock_irqrestore(&map_idr_lock, flags);
else
__release(&map_idr_lock);
}
#ifdef CONFIG_MEMCG_KMEM
static void bpf_map_save_memcg(struct bpf_map *map)
{
map->memcg = get_mem_cgroup_from_mm(current->mm);
}
static void bpf_map_release_memcg(struct bpf_map *map)
{
mem_cgroup_put(map->memcg);
}
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
struct mem_cgroup *old_memcg;
void *ptr;
old_memcg = set_active_memcg(map->memcg);
ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
set_active_memcg(old_memcg);
return ptr;
}
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
struct mem_cgroup *old_memcg;
void *ptr;
old_memcg = set_active_memcg(map->memcg);
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
return ptr;
}
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags)
{
struct mem_cgroup *old_memcg;
void __percpu *ptr;
old_memcg = set_active_memcg(map->memcg);
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
return ptr;
}
#else
static void bpf_map_save_memcg(struct bpf_map *map)
{
}
static void bpf_map_release_memcg(struct bpf_map *map)
{
}
#endif
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
security_bpf_map_free(map);
bpf_map_release_memcg(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
static void bpf_map_put_uref(struct bpf_map *map)
{
if (atomic64_dec_and_test(&map->usercnt)) {
if (map->ops->map_release_uref)
map->ops->map_release_uref(map);
}
}
/* decrement map refcnt and schedule it for freeing via workqueue
* (unrelying map implementation ops->map_free() might sleep)
*/
static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
{
if (atomic64_dec_and_test(&map->refcnt)) {
/* bpf_map_free_id() must be called first */
bpf_map_free_id(map, do_idr_lock);
btf_put(map->btf);
INIT_WORK(&map->work, bpf_map_free_deferred);
schedule_work(&map->work);
}
}
void bpf_map_put(struct bpf_map *map)
{
__bpf_map_put(map, true);
}
EXPORT_SYMBOL_GPL(bpf_map_put);
void bpf_map_put_with_uref(struct bpf_map *map)
{
bpf_map_put_uref(map);
bpf_map_put(map);
}
static int bpf_map_release(struct inode *inode, struct file *filp)
{
struct bpf_map *map = filp->private_data;
if (map->ops->map_release)
map->ops->map_release(map, filp);
bpf_map_put_with_uref(map);
return 0;
}
static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
{
fmode_t mode = f.file->f_mode;
/* Our file permissions may have been overridden by global
* map permissions facing syscall side.
*/
if (READ_ONCE(map->frozen))
mode &= ~FMODE_CAN_WRITE;
return mode;
}
#ifdef CONFIG_PROC_FS
/* Provides an approximation of the map's memory footprint.
* Used only to provide a backward compatibility and display
* a reasonable "memlock" info.
*/
static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
{
unsigned long size;
size = round_up(map->key_size + bpf_map_value_size(map), 8);
return round_up(map->max_entries * size, PAGE_SIZE);
}
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_map *map = filp->private_data;
const struct bpf_array *array;
u32 type = 0, jited = 0;
if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
array = container_of(map, struct bpf_array, map);
spin_lock(&array->aux->owner.lock);
type = array->aux->owner.type;
jited = array->aux->owner.jited;
spin_unlock(&array->aux->owner.lock);
}
seq_printf(m,
"map_type:\t%u\n"
"key_size:\t%u\n"
"value_size:\t%u\n"
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
"memlock:\t%lu\n"
"map_id:\t%u\n"
"frozen:\t%u\n",
map->map_type,
map->key_size,
map->value_size,
map->max_entries,
map->map_flags,
bpf_map_memory_footprint(map),
map->id,
READ_ONCE(map->frozen));
if (type) {
seq_printf(m, "owner_prog_type:\t%u\n", type);
seq_printf(m, "owner_jited:\t%u\n", jited);
}
}
#endif
static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
loff_t *ppos)
{
/* We need this handler such that alloc_file() enables
* f_mode with FMODE_CAN_READ.
*/
return -EINVAL;
}
static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
size_t siz, loff_t *ppos)
{
/* We need this handler such that alloc_file() enables
* f_mode with FMODE_CAN_WRITE.
*/
return -EINVAL;
}
/* called for any extra memory-mapped regions (except initial) */
static void bpf_map_mmap_open(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
if (vma->vm_flags & VM_MAYWRITE)
bpf_map_write_active_inc(map);
}
/* called for all unmapped memory region (including initial) */
static void bpf_map_mmap_close(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
if (vma->vm_flags & VM_MAYWRITE)
bpf_map_write_active_dec(map);
}
static const struct vm_operations_struct bpf_map_default_vmops = {
.open = bpf_map_mmap_open,
.close = bpf_map_mmap_close,
};
static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct bpf_map *map = filp->private_data;
int err;
if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
map_value_has_timer(map))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
mutex_lock(&map->freeze_mutex);
if (vma->vm_flags & VM_WRITE) {
if (map->frozen) {
err = -EPERM;
goto out;
}
/* map is meant to be read-only, so do not allow mapping as
* writable, because it's possible to leak a writable page
* reference and allows user-space to still modify it after
* freezing, while verifier will assume contents do not change
*/
if (map->map_flags & BPF_F_RDONLY_PROG) {
err = -EACCES;
goto out;
}
}
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
vma->vm_flags &= ~VM_MAYEXEC;
if (!(vma->vm_flags & VM_WRITE))
/* disallow re-mapping with PROT_WRITE */
vma->vm_flags &= ~VM_MAYWRITE;
err = map->ops->map_mmap(map, vma);
if (err)
goto out;
if (vma->vm_flags & VM_MAYWRITE)
bpf_map_write_active_inc(map);
out:
mutex_unlock(&map->freeze_mutex);
return err;
}
static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
{
struct bpf_map *map = filp->private_data;
if (map->ops->map_poll)
return map->ops->map_poll(map, filp, pts);
return EPOLLERR;
}
const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
#endif
.release = bpf_map_release,
.read = bpf_dummy_read,
.write = bpf_dummy_write,
.mmap = bpf_map_mmap,
.poll = bpf_map_poll,
};
int bpf_map_new_fd(struct bpf_map *map, int flags)
{
int ret;
ret = security_bpf_map(map, OPEN_FMODE(flags));
if (ret < 0)
return ret;
return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
flags | O_CLOEXEC);
}
int bpf_get_file_flag(int flags)
{
if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
return -EINVAL;
if (flags & BPF_F_RDONLY)
return O_RDONLY;
if (flags & BPF_F_WRONLY)
return O_WRONLY;
return O_RDWR;
}
/* helper macro to check that unused fields 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
sizeof(attr->CMD##_LAST_FIELD), 0, \
sizeof(*attr) - \
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
/* dst and src must have at least "size" number of bytes.
* Return strlen on success and < 0 on error.
*/
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
{
const char *end = src + size;
const char *orig_src = src;
memset(dst, 0, size);
/* Copy all isalnum(), '_' and '.' chars. */
while (src < end && *src) {
if (!isalnum(*src) &&
*src != '_' && *src != '.')
return -EINVAL;
*dst++ = *src++;
}
/* No '\0' found in "size" number of bytes */
if (src == end)
return -EINVAL;
return src - orig_src;
}
int map_check_no_btf(const struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
return -ENOTSUPP;
}
static int map_check_btf(struct bpf_map *map, const struct btf *btf,
u32 btf_key_id, u32 btf_value_id)
{
const struct btf_type *key_type, *value_type;
u32 key_size, value_size;
int ret = 0;
/* Some maps allow key to be unspecified. */
if (btf_key_id) {
key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
if (!key_type || key_size != map->key_size)
return -EINVAL;
} else {
key_type = btf_type_by_id(btf, 0);
if (!map->ops->map_check_btf)
return -EINVAL;
}
value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
if (!value_type || value_size != map->value_size)
return -EINVAL;
map->spin_lock_off = btf_find_spin_lock(btf, value_type);
if (map_value_has_spin_lock(map)) {
if (map->map_flags & BPF_F_RDONLY_PROG)
return -EACCES;
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
return -ENOTSUPP;
if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
map->value_size) {
WARN_ONCE(1,
"verifier bug spin_lock_off %d value_size %d\n",
map->spin_lock_off, map->value_size);
return -EFAULT;
}
}
map->timer_off = btf_find_timer(btf, value_type);
if (map_value_has_timer(map)) {
if (map->map_flags & BPF_F_RDONLY_PROG)
return -EACCES;
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY)
return -EOPNOTSUPP;
}
if (map->ops->map_check_btf)
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
return ret;
}
#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_map *map;
int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
if (attr->btf_vmlinux_value_type_id) {
if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
attr->btf_key_type_id || attr->btf_value_type_id)
return -EINVAL;
} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
return -EINVAL;
}
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
if (numa_node != NUMA_NO_NODE &&
((unsigned int)numa_node >= nr_node_ids ||
!node_online(numa_node)))
return -EINVAL;
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map(attr);
if (IS_ERR(map))
return PTR_ERR(map);
err = bpf_obj_name_cpy(map->name, attr->map_name,
sizeof(attr->map_name));
if (err < 0)
goto free_map;
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
map->spin_lock_off = -EINVAL;
map->timer_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
* to figure out the corresponding kernel's
* counter part. Thus, attr->btf_fd has
* to be valid also.
*/
attr->btf_vmlinux_value_type_id) {
struct btf *btf;
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
err = PTR_ERR(btf);
goto free_map;
}
if (btf_is_kernel(btf)) {
btf_put(btf);
err = -EACCES;
goto free_map;
}
map->btf = btf;
if (attr->btf_value_type_id) {
err = map_check_btf(map, btf, attr->btf_key_type_id,
attr->btf_value_type_id);
if (err)
goto free_map;
}
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
map->btf_vmlinux_value_type_id =
attr->btf_vmlinux_value_type_id;
}
err = security_bpf_map_alloc(map);
if (err)
goto free_map;
err = bpf_map_alloc_id(map);
if (err)
goto free_map_sec;
bpf_map_save_memcg(map);
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
* bpf_map_put_with_uref() is needed because the above
* bpf_map_alloc_id() has published the map
* to the userspace and the userspace may
* have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
*/
bpf_map_put_with_uref(map);
return err;
}
return err;
free_map_sec:
security_bpf_map_free(map);
free_map:
btf_put(map->btf);
map->ops->map_free(map);
return err;
}
/* if error is returned, fd is released.
* On success caller should complete fd access with matching fdput()
*/
struct bpf_map *__bpf_map_get(struct fd f)
{
if (!f.file)
return ERR_PTR(-EBADF);
if (f.file->f_op != &bpf_map_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
return f.file->private_data;
}
void bpf_map_inc(struct bpf_map *map)
{
atomic64_inc(&map->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_map_inc);
void bpf_map_inc_with_uref(struct bpf_map *map)
{
atomic64_inc(&map->refcnt);
atomic64_inc(&map->usercnt);
}
EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
struct bpf_map *bpf_map_get(u32 ufd)
{
struct fd f = fdget(ufd);
struct bpf_map *map;
map = __bpf_map_get(f);
if (IS_ERR(map))
return map;
bpf_map_inc(map);
fdput(f);
return map;
}
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
struct fd f = fdget(ufd);
struct bpf_map *map;
map = __bpf_map_get(f);
if (IS_ERR(map))
return map;
bpf_map_inc_with_uref(map);
fdput(f);
return map;
}
/* map_idr_lock should have been held */
static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
int refold;
refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
if (!refold)
return ERR_PTR(-ENOENT);
if (uref)
atomic64_inc(&map->usercnt);
return map;
}
struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
{
spin_lock_bh(&map_idr_lock);
map = __bpf_map_inc_not_zero(map, false);
spin_unlock_bh(&map_idr_lock);
return map;
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
}
static void *__bpf_copy_key(void __user *ukey, u64 key_size)
{
if (key_size)
return vmemdup_user(ukey, key_size);
if (ukey)
return ERR_PTR(-EINVAL);
return NULL;
}
static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
{
if (key_size)
return kvmemdup_bpfptr(ukey, key_size);
if (!bpfptr_is_null(ukey))
return ERR_PTR(-EINVAL);
return NULL;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
static int map_lookup_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *uvalue = u64_to_user_ptr(attr->value);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
if (attr->flags & ~BPF_F_LOCK)
return -EINVAL;
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
}
if ((attr->flags & BPF_F_LOCK) &&
!map_value_has_spin_lock(map)) {
err = -EINVAL;
goto err_put;
}
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
value_size = bpf_map_value_size(map);
err = -ENOMEM;
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
err = bpf_map_copy_value(map, key, value, attr->flags);
if (err)
goto free_value;
err = -EFAULT;
if (copy_to_user(uvalue, value, value_size) != 0)
goto free_value;
err = 0;
free_value:
kvfree(value);
free_key:
kvfree(key);
err_put:
fdput(f);
return err;
}
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
{
bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
return -EINVAL;
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
if ((attr->flags & BPF_F_LOCK) &&
!map_value_has_spin_lock(map)) {
err = -EINVAL;
goto err_put;
}
key = ___bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
value_size = bpf_map_value_size(map);
err = -ENOMEM;
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
err = -EFAULT;
if (copy_from_bpfptr(value, uvalue, value_size) != 0)
goto free_value;
err = bpf_map_update_value(map, f, key, value, attr->flags);
free_value:
kvfree(value);
free_key:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
fdput(f);
return err;
}
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
static int map_delete_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
int ufd = attr->map_fd;
struct bpf_map *map;
struct fd f;
void *key;
int err;
if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
return -EINVAL;
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
} else if (IS_FD_PROG_ARRAY(map) ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
/* These maps require sleepable context */
err = map->ops->map_delete_elem(map, key);
goto out;
}
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
maybe_wait_bpf_programs(map);
out:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
fdput(f);
return err;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
static int map_get_next_key(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *unext_key = u64_to_user_ptr(attr->next_key);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *next_key;
struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
return -EINVAL;
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
}
if (ukey) {
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
} else {
key = NULL;
}
err = -ENOMEM;
next_key = kvmalloc(map->key_size, GFP_USER);
if (!next_key)
goto free_key;
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_get_next_key(map, key, next_key);
goto out;
}
rcu_read_lock();
err = map->ops->map_get_next_key(map, key, next_key);
rcu_read_unlock();
out:
if (err)
goto free_next_key;
err = -EFAULT;
if (copy_to_user(unext_key, next_key, map->key_size) != 0)
goto free_next_key;
err = 0;
free_next_key:
kvfree(next_key);
free_key:
kvfree(key);
err_put:
fdput(f);
return err;
}
int generic_map_delete_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 cp, max_count;
int err = 0;
void *key;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
!map_value_has_spin_lock(map)) {
return -EINVAL;
}
max_count = attr->batch.count;
if (!max_count)
return 0;
key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
for (cp = 0; cp < max_count; cp++) {
err = -EFAULT;
if (copy_from_user(key, keys + cp * map->key_size,
map->key_size))
break;
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_delete_elem(map, key);
break;
}
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
maybe_wait_bpf_programs(map);
if (err)
break;
cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
err = -EFAULT;
kvfree(key);
return err;
}
int generic_map_update_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 value_size, cp, max_count;
int ufd = attr->batch.map_fd;
void *key, *value;
struct fd f;
int err = 0;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
!map_value_has_spin_lock(map)) {
return -EINVAL;
}
value_size = bpf_map_value_size(map);
max_count = attr->batch.count;
if (!max_count)
return 0;
key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!key)
return -ENOMEM;
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value) {
kvfree(key);
return -ENOMEM;
}
f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
for (cp = 0; cp < max_count; cp++) {
err = -EFAULT;
if (copy_from_user(key, keys + cp * map->key_size,
map->key_size) ||
copy_from_user(value, values + cp * value_size, value_size))
break;
err = bpf_map_update_value(map, f, key, value,
attr->batch.elem_flags);
if (err)
break;
cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
err = -EFAULT;
kvfree(value);
kvfree(key);
fdput(f);
return err;
}
#define MAP_LOOKUP_RETRIES 3
int generic_map_lookup_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
void *buf, *buf_prevkey, *prev_key, *key, *value;
int err, retry = MAP_LOOKUP_RETRIES;
u32 value_size, cp, max_count;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
!map_value_has_spin_lock(map))
return -EINVAL;
value_size = bpf_map_value_size(map);
max_count = attr->batch.count;
if (!max_count)
return 0;
if (put_user(0, &uattr->batch.count))
return -EFAULT;
buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
if (!buf_prevkey)
return -ENOMEM;
buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
if (!buf) {
kvfree(buf_prevkey);
return -ENOMEM;
}
err = -EFAULT;
prev_key = NULL;
if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size))
goto free_buf;
key = buf;
value = key + map->key_size;
if (ubatch)
prev_key = buf_prevkey;
for (cp = 0; cp < max_count;) {
rcu_read_lock();
err = map->ops->map_get_next_key(map, prev_key, key);
rcu_read_unlock();
if (err)
break;
err = bpf_map_copy_value(map, key, value,
attr->batch.elem_flags);
if (err == -ENOENT) {
if (retry) {
retry--;
continue;
}
err = -EINTR;
break;
}
if (err)
goto free_buf;
if (copy_to_user(keys + cp * map->key_size, key,
map->key_size)) {
err = -EFAULT;
goto free_buf;
}
if (copy_to_user(values + cp * value_size, value, value_size)) {
err = -EFAULT;
goto free_buf;
}
if (!prev_key)
prev_key = buf_prevkey;
swap(prev_key, key);
retry = MAP_LOOKUP_RETRIES;
cp++;
cond_resched();
}
if (err == -EFAULT)
goto free_buf;
if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
(cp && copy_to_user(uobatch, prev_key, map->key_size))))
err = -EFAULT;
free_buf:
kvfree(buf_prevkey);
kvfree(buf);
return err;
}
#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
static int map_lookup_and_delete_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_user_ptr(attr->key);
void __user *uvalue = u64_to_user_ptr(attr->value);
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
u32 value_size;
struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
return -EINVAL;
if (attr->flags & ~BPF_F_LOCK)
return -EINVAL;
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
if (attr->flags &&
(map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK)) {
err = -EINVAL;
goto err_put;
}
if ((attr->flags & BPF_F_LOCK) &&
!map_value_has_spin_lock(map)) {
err = -EINVAL;
goto err_put;
}
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
goto err_put;
}
value_size = bpf_map_value_size(map);
err = -ENOMEM;
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
err = -ENOTSUPP;
if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
err = map->ops->map_pop_elem(map, value);
} else if (map->map_type == BPF_MAP_TYPE_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
if (!bpf_map_is_dev_bound(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
rcu_read_unlock();
bpf_enable_instrumentation();
}
}
if (err)
goto free_value;
if (copy_to_user(uvalue, value, value_size) != 0) {
err = -EFAULT;
goto free_value;
}
err = 0;
free_value:
kvfree(value);
free_key:
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
fdput(f);
return err;
}
#define BPF_MAP_FREEZE_LAST_FIELD map_fd
static int map_freeze(const union bpf_attr *attr)
{
int err = 0, ufd = attr->map_fd;
struct bpf_map *map;
struct fd f;
if (CHECK_ATTR(BPF_MAP_FREEZE))
return -EINVAL;
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
map_value_has_timer(map)) {
fdput(f);
return -ENOTSUPP;
}
mutex_lock(&map->freeze_mutex);
if (bpf_map_write_active(map)) {
err = -EBUSY;
goto err_put;
}
if (READ_ONCE(map->frozen)) {
err = -EBUSY;
goto err_put;
}
if (!bpf_capable()) {
err = -EPERM;
goto err_put;
}
WRITE_ONCE(map->frozen, true);
err_put:
mutex_unlock(&map->freeze_mutex);
fdput(f);
return err;
}
static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = & _name ## _prog_ops,
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
const struct bpf_prog_ops *ops;
if (type >= ARRAY_SIZE(bpf_prog_types))
return -EINVAL;
type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
ops = bpf_prog_types[type];
if (!ops)
return -EINVAL;
if (!bpf_prog_is_dev_bound(prog->aux))
prog->aux->ops = ops;
else
prog->aux->ops = &bpf_offload_prog_ops;
prog->type = type;
return 0;
}
enum bpf_audit {
BPF_AUDIT_LOAD,
BPF_AUDIT_UNLOAD,
BPF_AUDIT_MAX,
};
static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
[BPF_AUDIT_LOAD] = "LOAD",
[BPF_AUDIT_UNLOAD] = "UNLOAD",
};
static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)
{
struct audit_context *ctx = NULL;
struct audit_buffer *ab;
if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
return;
if (audit_enabled == AUDIT_OFF)
return;
if (op == BPF_AUDIT_LOAD)
ctx = audit_context();
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
if (unlikely(!ab))
return;
audit_log_format(ab, "prog-id=%u op=%s",
prog->aux->id, bpf_audit_str[op]);
audit_log_end(ab);
}
static int bpf_prog_alloc_id(struct bpf_prog *prog)
{
int id;
idr_preload(GFP_KERNEL);
spin_lock_bh(&prog_idr_lock);
id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
if (id > 0)
prog->aux->id = id;
spin_unlock_bh(&prog_idr_lock);
idr_preload_end();
/* id is in [1, INT_MAX) */
if (WARN_ON_ONCE(!id))
return -ENOSPC;
return id > 0 ? 0 : id;
}
void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
{
unsigned long flags;
/* cBPF to eBPF migrations are currently not in the idr store.
* Offloaded programs are removed from the store when their device
* disappears - even if someone grabs an fd to them they are unusable,
* simply waiting for refcnt to drop to be freed.
*/
if (!prog->aux->id)
return;
if (do_idr_lock)
spin_lock_irqsave(&prog_idr_lock, flags);
else
__acquire(&prog_idr_lock);
idr_remove(&prog_idr, prog->aux->id);
prog->aux->id = 0;
if (do_idr_lock)
spin_unlock_irqrestore(&prog_idr_lock, flags);
else
__release(&prog_idr_lock);
}
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
kvfree(aux->func_info);
kfree(aux->func_info_aux);
free_uid(aux->user);
security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
bpf_prog_kallsyms_del_all(prog);
btf_put(prog->aux->btf);
kvfree(prog->aux->jited_linfo);
kvfree(prog->aux->linfo);
kfree(prog->aux->kfunc_tab);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
if (deferred) {
if (prog->aux->sleepable)
call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu);
else
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
} else {
__bpf_prog_put_rcu(&prog->aux->rcu);
}
}
static void bpf_prog_put_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
struct bpf_prog *prog;
aux = container_of(work, struct bpf_prog_aux, work);
prog = aux->prog;
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
__bpf_prog_put_noref(prog, true);
}
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
struct bpf_prog_aux *aux = prog->aux;
if (atomic64_dec_and_test(&aux->refcnt)) {
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
if (in_irq() || irqs_disabled()) {
INIT_WORK(&aux->work, bpf_prog_put_deferred);
schedule_work(&aux->work);
} else {
bpf_prog_put_deferred(&aux->work);
}
}
}
void bpf_prog_put(struct bpf_prog *prog)
{
__bpf_prog_put(prog, true);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
static int bpf_prog_release(struct inode *inode, struct file *filp)
{
struct bpf_prog *prog = filp->private_data;
bpf_prog_put(prog);
return 0;
}
struct bpf_prog_kstats {
u64 nsecs;
u64 cnt;
u64 misses;
};
static void bpf_prog_get_stats(const struct bpf_prog *prog,
struct bpf_prog_kstats *stats)
{
u64 nsecs = 0, cnt = 0, misses = 0;
int cpu;
for_each_possible_cpu(cpu) {
const struct bpf_prog_stats *st;
unsigned int start;
u64 tnsecs, tcnt, tmisses;
st = per_cpu_ptr(prog->stats, cpu);
do {
start = u64_stats_fetch_begin_irq(&st->syncp);
tnsecs = u64_stats_read(&st->nsecs);
tcnt = u64_stats_read(&st->cnt);
tmisses = u64_stats_read(&st->misses);
} while (u64_stats_fetch_retry_irq(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
misses += tmisses;
}
stats->nsecs = nsecs;
stats->cnt = cnt;
stats->misses = misses;
}
#ifdef CONFIG_PROC_FS
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
struct bpf_prog_kstats stats;
bpf_prog_get_stats(prog, &stats);
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"prog_type:\t%u\n"
"prog_jited:\t%u\n"
"prog_tag:\t%s\n"
"memlock:\t%llu\n"
"prog_id:\t%u\n"
"run_time_ns:\t%llu\n"
"run_cnt:\t%llu\n"
"recursion_misses:\t%llu\n",
prog->type,
prog->jited,
prog_tag,
prog->pages * 1ULL << PAGE_SHIFT,
prog->aux->id,
stats.nsecs,
stats.cnt,
stats.misses);
}
#endif
const struct file_operations bpf_prog_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_prog_show_fdinfo,
#endif
.release = bpf_prog_release,
.read = bpf_dummy_read,
.write = bpf_dummy_write,
};
int bpf_prog_new_fd(struct bpf_prog *prog)
{
int ret;
ret = security_bpf_prog(prog);
if (ret < 0)
return ret;
return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
O_RDWR | O_CLOEXEC);
}
static struct bpf_prog *____bpf_prog_get(struct fd f)
{
if (!f.file)
return ERR_PTR(-EBADF);
if (f.file->f_op != &bpf_prog_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
return f.file->private_data;
}
void bpf_prog_add(struct bpf_prog *prog, int i)
{
atomic64_add(i, &prog->aux->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_prog_add);
void bpf_prog_sub(struct bpf_prog *prog, int i)
{
/* Only to be used for undoing previous bpf_prog_add() in some
* error path. We still know that another entity in our call
* path holds a reference to the program, thus atomic_sub() can
* be safely used in such cases!
*/
WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
}
EXPORT_SYMBOL_GPL(bpf_prog_sub);
void bpf_prog_inc(struct bpf_prog *prog)
{
atomic64_inc(&prog->aux->refcnt);
}
EXPORT_SYMBOL_GPL(bpf_prog_inc);
/* prog_idr_lock should have been held */
struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
{
int refold;
refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
if (!refold)
return ERR_PTR(-ENOENT);
return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
bool bpf_prog_get_ok(struct bpf_prog *prog,
enum bpf_prog_type *attach_type, bool attach_drv)
{
/* not an attachment, just a refcount inc, always allow */
if (!attach_type)
return true;
if (prog->type != *attach_type)
return false;
if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
return false;
return true;
}
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
bool attach_drv)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
prog = ____bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
prog = ERR_PTR(-EINVAL);
goto out;
}
bpf_prog_inc(prog);
out:
fdput(f);
return prog;
}
struct bpf_prog *bpf_prog_get(u32 ufd)
{
return __bpf_prog_get(ufd, NULL, false);
}
struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
bool attach_drv)
{
return __bpf_prog_get(ufd, &type, attach_drv);
}
EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
/* Initially all BPF programs could be loaded w/o specifying
* expected_attach_type. Later for some of them specifying expected_attach_type
* at load time became required so that program could be validated properly.
* Programs of types that are allowed to be loaded both w/ and w/o (for
* backward compatibility) expected_attach_type, should have the default attach
* type assigned to expected_attach_type for the latter case, so that it can be
* validated later at attach time.
*
* bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
* prog type requires it but has some attach types that have to be backward
* compatible.
*/
static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
{
switch (attr->prog_type) {
case BPF_PROG_TYPE_CGROUP_SOCK:
/* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't
* exist so checking for non-zero is the way to go here.
*/
if (!attr->expected_attach_type)
attr->expected_attach_type =
BPF_CGROUP_INET_SOCK_CREATE;
break;
case BPF_PROG_TYPE_SK_REUSEPORT:
if (!attr->expected_attach_type)
attr->expected_attach_type =
BPF_SK_REUSEPORT_SELECT;
break;
}
}
static int
bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
enum bpf_attach_type expected_attach_type,
struct btf *attach_btf, u32 btf_id,
struct bpf_prog *dst_prog)
{
if (btf_id) {
if (btf_id > BTF_MAX_TYPE)
return -EINVAL;
if (!attach_btf && !dst_prog)
return -EINVAL;
switch (prog_type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_LSM:
case BPF_PROG_TYPE_STRUCT_OPS:
case BPF_PROG_TYPE_EXT:
break;
default:
return -EINVAL;
}
}
if (attach_btf && (!btf_id || dst_prog))
return -EINVAL;
if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
prog_type != BPF_PROG_TYPE_EXT)
return -EINVAL;
switch (prog_type) {
case BPF_PROG_TYPE_CGROUP_SOCK:
switch (expected_attach_type) {
case BPF_CGROUP_INET_SOCK_CREATE:
case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
return 0;
default:
return -EINVAL;
}
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
switch (expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
return 0;
default:
return -EINVAL;
}
case BPF_PROG_TYPE_CGROUP_SKB:
switch (expected_attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
return 0;
default:
return -EINVAL;
}
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
switch (expected_attach_type) {
case BPF_CGROUP_SETSOCKOPT:
case BPF_CGROUP_GETSOCKOPT:
return 0;
default:
return -EINVAL;
}
case BPF_PROG_TYPE_SK_LOOKUP:
if (expected_attach_type == BPF_SK_LOOKUP)
return 0;
return -EINVAL;
case BPF_PROG_TYPE_SK_REUSEPORT:
switch (expected_attach_type) {
case BPF_SK_REUSEPORT_SELECT:
case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
return 0;
default:
return -EINVAL;
}
case BPF_PROG_TYPE_SYSCALL:
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
return -EINVAL;
fallthrough;
default:
return 0;
}
}
static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
{
switch (prog_type) {
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
case BPF_PROG_TYPE_LWT_XMIT:
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
case BPF_PROG_TYPE_LIRC_MODE2:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_EXT: /* extends any prog */
return true;
case BPF_PROG_TYPE_CGROUP_SKB:
/* always unpriv */
case BPF_PROG_TYPE_SK_REUSEPORT:
/* equivalent to SOCKET_FILTER. need CAP_BPF only */
default:
return false;
}
}
static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
{
switch (prog_type) {
case BPF_PROG_TYPE_KPROBE:
case BPF_PROG_TYPE_TRACEPOINT:
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_LSM:
case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
case BPF_PROG_TYPE_EXT: /* extends any prog */
return true;
default:
return false;
}
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD fd_array
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
struct btf *attach_btf = NULL;
int err;
char license[128];
bool is_gpl;
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
BPF_F_TEST_RND_HI32))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
!bpf_capable())
return -EPERM;
/* copy eBPF program license from user space */
if (strncpy_from_bpfptr(license,
make_bpfptr(attr->license, uattr.is_kernel),
sizeof(license) - 1) < 0)
return -EFAULT;
license[sizeof(license) - 1] = 0;
/* eBPF programs must be GPL compatible to use GPL-ed functions */
is_gpl = license_is_gpl_compatible(license);
if (attr->insn_cnt == 0 ||
attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
return -E2BIG;
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
!bpf_capable())
return -EPERM;
if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
return -EPERM;
if (is_perfmon_prog_type(type) && !perfmon_capable())
return -EPERM;
/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
* or btf, we need to check which one it is
*/
if (attr->attach_prog_fd) {
dst_prog = bpf_prog_get(attr->attach_prog_fd);
if (IS_ERR(dst_prog)) {
dst_prog = NULL;
attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
if (IS_ERR(attach_btf))
return -EINVAL;
if (!btf_is_kernel(attach_btf)) {
/* attaching through specifying bpf_prog's BTF
* objects directly might be supported eventually
*/
btf_put(attach_btf);
return -ENOTSUPP;
}
}
} else if (attr->attach_btf_id) {
/* fall back to vmlinux BTF, if BTF type ID is specified */
attach_btf = bpf_get_btf_vmlinux();
if (IS_ERR(attach_btf))
return PTR_ERR(attach_btf);
if (!attach_btf)
return -EINVAL;
btf_get(attach_btf);
}
bpf_prog_load_fixup_attach_type(attr);
if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
attach_btf, attr->attach_btf_id,
dst_prog)) {
if (dst_prog)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
return -EINVAL;
}
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog) {
if (dst_prog)
bpf_prog_put(dst_prog);
if (attach_btf)
btf_put(attach_btf);
return -ENOMEM;
}
prog->expected_attach_type = attr->expected_attach_type;
prog->aux->attach_btf = attach_btf;
prog->aux->attach_btf_id = attr->attach_btf_id;
prog->aux->dst_prog = dst_prog;
prog->aux->offload_requested = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog;
prog->aux->user = get_current_user();
prog->len = attr->insn_cnt;
err = -EFAULT;
if (copy_from_bpfptr(prog->insns,
make_bpfptr(attr->insns, uattr.is_kernel),
bpf_prog_insn_size(prog)) != 0)
goto free_prog_sec;
prog->orig_prog = NULL;
prog->jited = 0;
atomic64_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_offload_init(prog, attr);
if (err)
goto free_prog_sec;
}
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog_sec;
prog->aux->load_time = ktime_get_boottime_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
sizeof(attr->prog_name));
if (err < 0)
goto free_prog_sec;
/* run eBPF verifier */
err = bpf_check(&prog, attr, uattr);
if (err < 0)
goto free_used_maps;
prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
goto free_used_maps;
err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;
/* Upon success of bpf_prog_alloc_id(), the BPF prog is
* effectively publicly exposed. However, retrieving via
* bpf_prog_get_fd_by_id() will take another reference,
* therefore it cannot be gone underneath us.
*
* Only for the time /after/ successful bpf_prog_new_fd()
* and before returning to userspace, we might just hold
* one reference and any parallel close on that fd could
* rip everything out. Hence, below notifications must
* happen before bpf_prog_new_fd().
*
* Also, any failure handling from this point onwards must
* be using bpf_prog_put() given the program is exposed.
*/
bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_LOAD);
err = bpf_prog_new_fd(prog);
if (err < 0)
bpf_prog_put(prog);
return err;
free_used_maps:
/* In case we have subprogs, we need to wait for a grace
* period before we can tear down JIT memory since symbols
* are already exposed under kallsyms.
*/
__bpf_prog_put_noref(prog, prog->aux->func_cnt);
return err;
free_prog_sec:
free_uid(prog->aux->user);
security_bpf_prog_free(prog->aux);
free_prog:
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
bpf_prog_free(prog);
return err;
}
#define BPF_OBJ_LAST_FIELD file_flags
static int bpf_obj_pin(const union bpf_attr *attr)
{
if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
return -EINVAL;
return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
}
static int bpf_obj_get(const union bpf_attr *attr)
{
if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
attr->file_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
attr->file_flags);
}
void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
const struct bpf_link_ops *ops, struct bpf_prog *prog)
{
atomic64_set(&link->refcnt, 1);
link->type = type;
link->id = 0;
link->ops = ops;
link->prog = prog;
}
static void bpf_link_free_id(int id)
{
if (!id)
return;
spin_lock_bh(&link_idr_lock);
idr_remove(&link_idr, id);
spin_unlock_bh(&link_idr_lock);
}
/* Clean up bpf_link and corresponding anon_inode file and FD. After
* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
* anon_inode's release() call. This helper marksbpf_link as
* defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
* is not decremented, it's the responsibility of a calling code that failed
* to complete bpf_link initialization.
*/
void bpf_link_cleanup(struct bpf_link_primer *primer)
{
primer->link->prog = NULL;
bpf_link_free_id(primer->id);
fput(primer->file);
put_unused_fd(primer->fd);
}
void bpf_link_inc(struct bpf_link *link)
{
atomic64_inc(&link->refcnt);
}
/* bpf_link_free is guaranteed to be called from process context */
static void bpf_link_free(struct bpf_link *link)
{
bpf_link_free_id(link->id);
if (link->prog) {
/* detach BPF program, clean up used resources */
link->ops->release(link);
bpf_prog_put(link->prog);
}
/* free bpf_link and its containing memory */
link->ops->dealloc(link);
}
static void bpf_link_put_deferred(struct work_struct *work)
{
struct bpf_link *link = container_of(work, struct bpf_link, work);
bpf_link_free(link);
}
/* bpf_link_put can be called from atomic context, but ensures that resources
* are freed from process context
*/
void bpf_link_put(struct bpf_link *link)
{
if (!atomic64_dec_and_test(&link->refcnt))
return;
if (in_atomic()) {
INIT_WORK(&link->work, bpf_link_put_deferred);
schedule_work(&link->work);
} else {
bpf_link_free(link);
}
}
static int bpf_link_release(struct inode *inode, struct file *filp)
{
struct bpf_link *link = filp->private_data;
bpf_link_put(link);
return 0;
}
#ifdef CONFIG_PROC_FS
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
static const char *bpf_link_type_strs[] = {
[BPF_LINK_TYPE_UNSPEC] = "<invalid>",
#include <linux/bpf_types.h>
};
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_link *link = filp->private_data;
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
"link_id:\t%u\n"
"prog_tag:\t%s\n"
"prog_id:\t%u\n",
bpf_link_type_strs[link->type],
link->id,
prog_tag,
prog->aux->id);
if (link->ops->show_fdinfo)
link->ops->show_fdinfo(link, m);
}
#endif
static const struct file_operations bpf_link_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_link_show_fdinfo,
#endif
.release = bpf_link_release,
.read = bpf_dummy_read,
.write = bpf_dummy_write,
};
static int bpf_link_alloc_id(struct bpf_link *link)
{
int id;
idr_preload(GFP_KERNEL);
spin_lock_bh(&link_idr_lock);
id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC);
spin_unlock_bh(&link_idr_lock);
idr_preload_end();
return id;
}
/* Prepare bpf_link to be exposed to user-space by allocating anon_inode file,
* reserving unused FD and allocating ID from link_idr. This is to be paired
* with bpf_link_settle() to install FD and ID and expose bpf_link to
* user-space, if bpf_link is successfully attached. If not, bpf_link and
* pre-allocated resources are to be freed with bpf_cleanup() call. All the
* transient state is passed around in struct bpf_link_primer.
* This is preferred way to create and initialize bpf_link, especially when
* there are complicated and expensive operations inbetween creating bpf_link
* itself and attaching it to BPF hook. By using bpf_link_prime() and
* bpf_link_settle() kernel code using bpf_link doesn't have to perform
* expensive (and potentially failing) roll back operations in a rare case
* that file, FD, or ID can't be allocated.
*/
int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
{
struct file *file;
int fd, id;
fd = get_unused_fd_flags(O_CLOEXEC);
if (fd < 0)
return fd;
id = bpf_link_alloc_id(link);
if (id < 0) {
put_unused_fd(fd);
return id;
}
file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
if (IS_ERR(file)) {
bpf_link_free_id(id);
put_unused_fd(fd);
return PTR_ERR(file);
}
primer->link = link;
primer->file = file;
primer->fd = fd;
primer->id = id;
return 0;
}
int bpf_link_settle(struct bpf_link_primer *primer)
{
/* make bpf_link fetchable by ID */
spin_lock_bh(&link_idr_lock);
primer->link->id = primer->id;
spin_unlock_bh(&link_idr_lock);
/* make bpf_link fetchable by FD */
fd_install(primer->fd, primer->file);
/* pass through installed FD */
return primer->fd;
}
int bpf_link_new_fd(struct bpf_link *link)
{
return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
}
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
{
struct fd f = fdget(ufd);
struct bpf_link *link;
if (!f.file)
return ERR_PTR(-EBADF);
if (f.file->f_op != &bpf_link_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
link = f.file->private_data;
bpf_link_inc(link);
fdput(f);
return link;
}
struct bpf_tracing_link {
struct bpf_link link;
enum bpf_attach_type attach_type;
struct bpf_trampoline *trampoline;
struct bpf_prog *tgt_prog;
};
static void bpf_tracing_link_release(struct bpf_link *link)
{
struct bpf_tracing_link *tr_link =
container_of(link, struct bpf_tracing_link, link);
WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog,
tr_link->trampoline));
bpf_trampoline_put(tr_link->trampoline);
/* tgt_prog is NULL if target is a kernel function */
if (tr_link->tgt_prog)
bpf_prog_put(tr_link->tgt_prog);
}
static void bpf_tracing_link_dealloc(struct bpf_link *link)
{
struct bpf_tracing_link *tr_link =
container_of(link, struct bpf_tracing_link, link);
kfree(tr_link);
}
static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_tracing_link *tr_link =
container_of(link, struct bpf_tracing_link, link);
seq_printf(seq,
"attach_type:\t%d\n",
tr_link->attach_type);
}
static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
struct bpf_tracing_link *tr_link =
container_of(link, struct bpf_tracing_link, link);
info->tracing.attach_type = tr_link->attach_type;
bpf_trampoline_unpack_key(tr_link->trampoline->key,
&info->tracing.target_obj_id,
&info->tracing.target_btf_id);
return 0;
}
static const struct bpf_link_ops bpf_tracing_link_lops = {
.release = bpf_tracing_link_release,
.dealloc = bpf_tracing_link_dealloc,
.show_fdinfo = bpf_tracing_link_show_fdinfo,
.fill_link_info = bpf_tracing_link_fill_link_info,
};
static int bpf_tracing_prog_attach(struct bpf_prog *prog,
int tgt_prog_fd,
u32 btf_id)
{
struct bpf_link_primer link_primer;
struct bpf_prog *tgt_prog = NULL;
struct bpf_trampoline *tr = NULL;
struct bpf_tracing_link *link;
u64 key = 0;
int err;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
prog->expected_attach_type != BPF_TRACE_FEXIT &&
prog->expected_attach_type != BPF_MODIFY_RETURN) {
err = -EINVAL;
goto out_put_prog;
}
break;
case BPF_PROG_TYPE_EXT:
if (prog->expected_attach_type != 0) {
err = -EINVAL;
goto out_put_prog;
}
break;
case BPF_PROG_TYPE_LSM:
if (prog->expected_attach_type != BPF_LSM_MAC) {
err = -EINVAL;
goto out_put_prog;
}
break;
default:
err = -EINVAL;
goto out_put_prog;
}
if (!!tgt_prog_fd != !!btf_id) {
err = -EINVAL;
goto out_put_prog;
}
if (tgt_prog_fd) {
/* For now we only allow new targets for BPF_PROG_TYPE_EXT */
if (prog->type != BPF_PROG_TYPE_EXT) {
err = -EINVAL;
goto out_put_prog;
}
tgt_prog = bpf_prog_get(tgt_prog_fd);
if (IS_ERR(tgt_prog)) {
err = PTR_ERR(tgt_prog);
tgt_prog = NULL;
goto out_put_prog;
}
key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
}
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
err = -ENOMEM;
goto out_put_prog;
}
bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING,
&bpf_tracing_link_lops, prog);
link->attach_type = prog->expected_attach_type;
mutex_lock(&prog->aux->dst_mutex);
/* There are a few possible cases here:
*
* - if prog->aux->dst_trampoline is set, the program was just loaded
* and not yet attached to anything, so we can use the values stored
* in prog->aux
*
* - if prog->aux->dst_trampoline is NULL, the program has already been
* attached to a target and its initial target was cleared (below)
*
* - if tgt_prog != NULL, the caller specified tgt_prog_fd +
* target_btf_id using the link_create API.
*
* - if tgt_prog == NULL when this function was called using the old
* raw_tracepoint_open API, and we need a target from prog->aux
*
* - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
* was detached and is going for re-attachment.
*/
if (!prog->aux->dst_trampoline && !tgt_prog) {
/*
* Allow re-attach for TRACING and LSM programs. If it's
* currently linked, bpf_trampoline_link_prog will fail.
* EXT programs need to specify tgt_prog_fd, so they
* re-attach in separate code path.
*/
if (prog->type != BPF_PROG_TYPE_TRACING &&
prog->type != BPF_PROG_TYPE_LSM) {
err = -EINVAL;
goto out_unlock;
}
btf_id = prog->aux->attach_btf_id;
key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
}
if (!prog->aux->dst_trampoline ||
(key && key != prog->aux->dst_trampoline->key)) {
/* If there is no saved target, or the specified target is
* different from the destination specified at load time, we
* need a new trampoline and a check for compatibility
*/
struct bpf_attach_target_info tgt_info = {};
err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
&tgt_info);
if (err)
goto out_unlock;
tr = bpf_trampoline_get(key, &tgt_info);
if (!tr) {
err = -ENOMEM;
goto out_unlock;
}
} else {
/* The caller didn't specify a target, or the target was the
* same as the destination supplied during program load. This
* means we can reuse the trampoline and reference from program
* load time, and there is no need to allocate a new one. This
* can only happen once for any program, as the saved values in
* prog->aux are cleared below.
*/
tr = prog->aux->dst_trampoline;
tgt_prog = prog->aux->dst_prog;
}
err = bpf_link_prime(&link->link, &link_primer);
if (err)
goto out_unlock;
err = bpf_trampoline_link_prog(prog, tr);
if (err) {
bpf_link_cleanup(&link_primer);
link = NULL;
goto out_unlock;
}
link->tgt_prog = tgt_prog;
link->trampoline = tr;
/* Always clear the trampoline and target prog from prog->aux to make
* sure the original attach destination is not kept alive after a
* program is (re-)attached to another target.
*/
if (prog->aux->dst_prog &&
(tgt_prog_fd || tr != prog->aux->dst_trampoline))
/* got extra prog ref from syscall, or attaching to different prog */
bpf_prog_put(prog->aux->dst_prog);
if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
/* we allocated a new trampoline, so free the old one */
bpf_trampoline_put(prog->aux->dst_trampoline);
prog->aux->dst_prog = NULL;
prog->aux->dst_trampoline = NULL;
mutex_unlock(&prog->aux->dst_mutex);
return bpf_link_settle(&link_primer);
out_unlock:
if (tr && tr != prog->aux->dst_trampoline)
bpf_trampoline_put(tr);
mutex_unlock(&prog->aux->dst_mutex);
kfree(link);
out_put_prog:
if (tgt_prog_fd && tgt_prog)
bpf_prog_put(tgt_prog);
return err;
}
struct bpf_raw_tp_link {
struct bpf_link link;
struct bpf_raw_event_map *btp;
};
static void bpf_raw_tp_link_release(struct bpf_link *link)
{
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
bpf_put_raw_tracepoint(raw_tp->btp);
}
static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
{
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
kfree(raw_tp);
}
static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_raw_tp_link *raw_tp_link =
container_of(link, struct bpf_raw_tp_link, link);
seq_printf(seq,
"tp_name:\t%s\n",
raw_tp_link->btp->tp->name);
}
static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
struct bpf_raw_tp_link *raw_tp_link =
container_of(link, struct bpf_raw_tp_link, link);
char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
const char *tp_name = raw_tp_link->btp->tp->name;
u32 ulen = info->raw_tracepoint.tp_name_len;
size_t tp_len = strlen(tp_name);
if (!ulen ^ !ubuf)
return -EINVAL;
info->raw_tracepoint.tp_name_len = tp_len + 1;
if (!ubuf)
return 0;
if (ulen >= tp_len + 1) {
if (copy_to_user(ubuf, tp_name, tp_len + 1))
return -EFAULT;
} else {
char zero = '\0';
if (copy_to_user(ubuf, tp_name, ulen - 1))
return -EFAULT;
if (put_user(zero, ubuf + ulen - 1))
return -EFAULT;
return -ENOSPC;
}
return 0;
}
static const struct bpf_link_ops bpf_raw_tp_link_lops = {
.release = bpf_raw_tp_link_release,
.dealloc = bpf_raw_tp_link_dealloc,
.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
.fill_link_info = bpf_raw_tp_link_fill_link_info,
};
#ifdef CONFIG_PERF_EVENTS
struct bpf_perf_link {
struct bpf_link link;
struct file *perf_file;
};
static void bpf_perf_link_release(struct bpf_link *link)
{
struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
struct perf_event *event = perf_link->perf_file->private_data;
perf_event_free_bpf_prog(event);
fput(perf_link->perf_file);
}
static void bpf_perf_link_dealloc(struct bpf_link *link)
{
struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link);
kfree(perf_link);
}
static const struct bpf_link_ops bpf_perf_link_lops = {
.release = bpf_perf_link_release,
.dealloc = bpf_perf_link_dealloc,
};
static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_link_primer link_primer;
struct bpf_perf_link *link;
struct perf_event *event;
struct file *perf_file;
int err;
if (attr->link_create.flags)
return -EINVAL;
perf_file = perf_event_get(attr->link_create.target_fd);
if (IS_ERR(perf_file))
return PTR_ERR(perf_file);
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
err = -ENOMEM;
goto out_put_file;
}
bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
link->perf_file = perf_file;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
goto out_put_file;
}
event = perf_file->private_data;
err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_file;
}
/* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */
bpf_prog_inc(prog);
return bpf_link_settle(&link_primer);
out_put_file:
fput(perf_file);
return err;
}
#endif /* CONFIG_PERF_EVENTS */
#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
struct bpf_raw_event_map *btp;
struct bpf_prog *prog;
const char *tp_name;
char buf[128];
int err;
if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
return -EINVAL;
prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_EXT:
case BPF_PROG_TYPE_LSM:
if (attr->raw_tracepoint.name) {
/* The attach point for this category of programs
* should be specified via btf_id during program load.
*/
err = -EINVAL;
goto out_put_prog;
}
if (prog->type == BPF_PROG_TYPE_TRACING &&
prog->expected_attach_type == BPF_TRACE_RAW_TP) {
tp_name = prog->aux->attach_func_name;
break;
}
err = bpf_tracing_prog_attach(prog, 0, 0);
if (err >= 0)
return err;
goto out_put_prog;
case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
if (strncpy_from_user(buf,
u64_to_user_ptr(attr->raw_tracepoint.name),
sizeof(buf) - 1) < 0) {
err = -EFAULT;
goto out_put_prog;
}
buf[sizeof(buf) - 1] = 0;
tp_name = buf;
break;
default:
err = -EINVAL;
goto out_put_prog;
}
btp = bpf_get_raw_tracepoint(tp_name);
if (!btp) {
err = -ENOENT;
goto out_put_prog;
}
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
err = -ENOMEM;
goto out_put_btp;
}
bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
&bpf_raw_tp_link_lops, prog);
link->btp = btp;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
goto out_put_btp;
}
err = bpf_probe_register(link->btp, prog);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_btp;
}
return bpf_link_settle(&link_primer);
out_put_btp:
bpf_put_raw_tracepoint(btp);
out_put_prog:
bpf_prog_put(prog);
return err;
}
static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
enum bpf_attach_type attach_type)
{
switch (prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_SK_LOOKUP:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
case BPF_PROG_TYPE_CGROUP_SKB:
if (!capable(CAP_NET_ADMIN))
/* cg-skb progs can be loaded by unpriv user.
* check permissions at attach time.
*/
return -EPERM;
return prog->enforce_expected_attach_type &&
prog->expected_attach_type != attach_type ?
-EINVAL : 0;
default:
return 0;
}
}
static enum bpf_prog_type
attach_type_to_prog_type(enum bpf_attach_type attach_type)
{
switch (attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
return BPF_PROG_TYPE_CGROUP_SKB;
case BPF_CGROUP_INET_SOCK_CREATE:
case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
return BPF_PROG_TYPE_CGROUP_SOCK;
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
case BPF_CGROUP_SOCK_OPS:
return BPF_PROG_TYPE_SOCK_OPS;
case BPF_CGROUP_DEVICE:
return BPF_PROG_TYPE_CGROUP_DEVICE;
case BPF_SK_MSG_VERDICT:
return BPF_PROG_TYPE_SK_MSG;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
case BPF_SK_SKB_VERDICT:
return BPF_PROG_TYPE_SK_SKB;
case BPF_LIRC_MODE2:
return BPF_PROG_TYPE_LIRC_MODE2;
case BPF_FLOW_DISSECTOR:
return BPF_PROG_TYPE_FLOW_DISSECTOR;
case BPF_CGROUP_SYSCTL:
return BPF_PROG_TYPE_CGROUP_SYSCTL;
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
return BPF_PROG_TYPE_CGROUP_SOCKOPT;
case BPF_TRACE_ITER:
return BPF_PROG_TYPE_TRACING;
case BPF_SK_LOOKUP:
return BPF_PROG_TYPE_SK_LOOKUP;
case BPF_XDP:
return BPF_PROG_TYPE_XDP;
default:
return BPF_PROG_TYPE_UNSPEC;
}
}
#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd
#define BPF_F_ATTACH_MASK \
(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE)
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
struct bpf_prog *prog;
int ret;
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
return -EINVAL;
ptype = attach_type_to_prog_type(attr->attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC)
return -EINVAL;
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
return PTR_ERR(prog);
if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) {
bpf_prog_put(prog);
return -EINVAL;
}
switch (ptype) {
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
ret = sock_map_get_from_fd(attr, prog);
break;
case BPF_PROG_TYPE_LIRC_MODE2:
ret = lirc_prog_attach(attr, prog);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
ret = netns_bpf_prog_attach(attr, prog);
break;
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
default:
ret = -EINVAL;
}
if (ret)
bpf_prog_put(prog);
return ret;
}
#define BPF_PROG_DETACH_LAST_FIELD attach_type
static int bpf_prog_detach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
if (CHECK_ATTR(BPF_PROG_DETACH))
return -EINVAL;
ptype = attach_type_to_prog_type(attr->attach_type);
switch (ptype) {
case BPF_PROG_TYPE_SK_MSG:
case BPF_PROG_TYPE_SK_SKB:
return sock_map_prog_detach(attr, ptype);
case BPF_PROG_TYPE_LIRC_MODE2:
return lirc_prog_detach(attr);
case BPF_PROG_TYPE_FLOW_DISSECTOR:
return netns_bpf_prog_detach(attr, ptype);
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
return cgroup_bpf_prog_detach(attr, ptype);
default:
return -EINVAL;
}
}
#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (CHECK_ATTR(BPF_PROG_QUERY))
return -EINVAL;
if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
return -EINVAL;
switch (attr->query.attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
case BPF_CGROUP_UDP6_RECVMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
case BPF_CGROUP_SYSCTL:
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
return cgroup_bpf_prog_query(attr, uattr);
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
case BPF_FLOW_DISSECTOR:
case BPF_SK_LOOKUP:
return netns_bpf_prog_query(attr, uattr);
default:
return -EINVAL;
}
}
#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu
static int bpf_prog_test_run(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
struct bpf_prog *prog;
int ret = -ENOTSUPP;
if (CHECK_ATTR(BPF_PROG_TEST_RUN))
return -EINVAL;
if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
(!attr->test.ctx_size_in && attr->test.ctx_in))
return -EINVAL;
if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
(!attr->test.ctx_size_out && attr->test.ctx_out))
return -EINVAL;
prog = bpf_prog_get(attr->test.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
if (prog->aux->ops->test_run)
ret = prog->aux->ops->test_run(prog, attr, uattr);
bpf_prog_put(prog);
return ret;
}
#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
static int bpf_obj_get_next_id(const union bpf_attr *attr,
union bpf_attr __user *uattr,
struct idr *idr,
spinlock_t *lock)
{
u32 next_id = attr->start_id;
int err = 0;
if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
next_id++;
spin_lock_bh(lock);
if (!idr_get_next(idr, &next_id))
err = -ENOENT;
spin_unlock_bh(lock);
if (!err)
err = put_user(next_id, &uattr->next_id);
return err;
}
struct bpf_map *bpf_map_get_curr_or_next(u32 *id)
{
struct bpf_map *map;
spin_lock_bh(&map_idr_lock);
again:
map = idr_get_next(&map_idr, id);
if (map) {
map = __bpf_map_inc_not_zero(map, false);
if (IS_ERR(map)) {
(*id)++;
goto again;
}
}
spin_unlock_bh(&map_idr_lock);
return map;
}
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id)
{
struct bpf_prog *prog;
spin_lock_bh(&prog_idr_lock);
again:
prog = idr_get_next(&prog_idr, id);
if (prog) {
prog = bpf_prog_inc_not_zero(prog);
if (IS_ERR(prog)) {
(*id)++;
goto again;
}
}
spin_unlock_bh(&prog_idr_lock);
return prog;
}
#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
struct bpf_prog *bpf_prog_by_id(u32 id)
{
struct bpf_prog *prog;
if (!id)
return ERR_PTR(-ENOENT);
spin_lock_bh(&prog_idr_lock);
prog = idr_find(&prog_idr, id);
if (prog)
prog = bpf_prog_inc_not_zero(prog);
else
prog = ERR_PTR(-ENOENT);
spin_unlock_bh(&prog_idr_lock);
return prog;
}
static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
{
struct bpf_prog *prog;
u32 id = attr->prog_id;
int fd;
if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
prog = bpf_prog_by_id(id);
if (IS_ERR(prog))
return PTR_ERR(prog);
fd = bpf_prog_new_fd(prog);
if (fd < 0)
bpf_prog_put(prog);
return fd;
}
#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
{
struct bpf_map *map;
u32 id = attr->map_id;
int f_flags;
int fd;
if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
attr->open_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
f_flags = bpf_get_file_flag(attr->open_flags);
if (f_flags < 0)
return f_flags;
spin_lock_bh(&map_idr_lock);
map = idr_find(&map_idr, id);
if (map)
map = __bpf_map_inc_not_zero(map, true);
else
map = ERR_PTR(-ENOENT);
spin_unlock_bh(&map_idr_lock);
if (IS_ERR(map))
return PTR_ERR(map);
fd = bpf_map_new_fd(map, f_flags);
if (fd < 0)
bpf_map_put_with_uref(map);
return fd;
}
static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
unsigned long addr, u32 *off,
u32 *type)
{
const struct bpf_map *map;
int i;
mutex_lock(&prog->aux->used_maps_mutex);
for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
map = prog->aux->used_maps[i];
if (map == (void *)addr) {
*type = BPF_PSEUDO_MAP_FD;
goto out;
}
if (!map->ops->map_direct_value_meta)
continue;
if (!map->ops->map_direct_value_meta(map, addr, off)) {
*type = BPF_PSEUDO_MAP_VALUE;
goto out;
}
}
map = NULL;
out:
mutex_unlock(&prog->aux->used_maps_mutex);
return map;
}
static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog,
const struct cred *f_cred)
{
const struct bpf_map *map;
struct bpf_insn *insns;
u32 off, type;
u64 imm;
u8 code;
int i;
insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
GFP_USER);
if (!insns)
return insns;
for (i = 0; i < prog->len; i++) {
code = insns[i].code;
if (code == (BPF_JMP | BPF_TAIL_CALL)) {
insns[i].code = BPF_JMP | BPF_CALL;
insns[i].imm = BPF_FUNC_tail_call;
/* fall-through */
}
if (code == (BPF_JMP | BPF_CALL) ||
code == (BPF_JMP | BPF_CALL_ARGS)) {
if (code == (BPF_JMP | BPF_CALL_ARGS))
insns[i].code = BPF_JMP | BPF_CALL;
if (!bpf_dump_raw_ok(f_cred))
insns[i].imm = 0;
continue;
}
if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
continue;
}
if (code != (BPF_LD | BPF_IMM | BPF_DW))
continue;
imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
map = bpf_map_from_imm(prog, imm, &off, &type);
if (map) {
insns[i].src_reg = type;
insns[i].imm = map->id;
insns[i + 1].imm = off;
continue;
}
}
return insns;
}
static int set_info_rec_size(struct bpf_prog_info *info)
{
/*
* Ensure info.*_rec_size is the same as kernel expected size
*
* or
*
* Only allow zero *_rec_size if both _rec_size and _cnt are
* zero. In this case, the kernel will set the expected
* _rec_size back to the info.
*/
if ((info->nr_func_info || info->func_info_rec_size) &&
info->func_info_rec_size != sizeof(struct bpf_func_info))
return -EINVAL;
if ((info->nr_line_info || info->line_info_rec_size) &&
info->line_info_rec_size != sizeof(struct bpf_line_info))
return -EINVAL;
if ((info->nr_jited_line_info || info->jited_line_info_rec_size) &&
info->jited_line_info_rec_size != sizeof(__u64))
return -EINVAL;
info->func_info_rec_size = sizeof(struct bpf_func_info);
info->line_info_rec_size = sizeof(struct bpf_line_info);
info->jited_line_info_rec_size = sizeof(__u64);
return 0;
}
static int bpf_prog_get_info_by_fd(struct file *file,
struct bpf_prog *prog,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_prog_info info;
u32 info_len = attr->info.info_len;
struct bpf_prog_kstats stats;
char __user *uinsns;
u32 ulen;
int err;
err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
memset(&info, 0, sizeof(info));
if (copy_from_user(&info, uinfo, info_len))
return -EFAULT;
info.type = prog->type;
info.id = prog->aux->id;
info.load_time = prog->aux->load_time;
info.created_by_uid = from_kuid_munged(current_user_ns(),
prog->aux->user->uid);
info.gpl_compatible = prog->gpl_compatible;
memcpy(info.tag, prog->tag, sizeof(prog->tag));
memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
mutex_lock(&prog->aux->used_maps_mutex);
ulen = info.nr_map_ids;
info.nr_map_ids = prog->aux->used_map_cnt;
ulen = min_t(u32, info.nr_map_ids, ulen);
if (ulen) {
u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
u32 i;
for (i = 0; i < ulen; i++)
if (put_user(prog->aux->used_maps[i]->id,
&user_map_ids[i])) {
mutex_unlock(&prog->aux->used_maps_mutex);
return -EFAULT;
}
}
mutex_unlock(&prog->aux->used_maps_mutex);
err = set_info_rec_size(&info);
if (err)
return err;
bpf_prog_get_stats(prog, &stats);
info.run_time_ns = stats.nsecs;
info.run_cnt = stats.cnt;
info.recursion_misses = stats.misses;
if (!bpf_capable()) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
info.nr_jited_ksyms = 0;
info.nr_jited_func_lens = 0;
info.nr_func_info = 0;
info.nr_line_info = 0;
info.nr_jited_line_info = 0;
goto done;
}
ulen = info.xlated_prog_len;
info.xlated_prog_len = bpf_prog_insn_size(prog);
if (info.xlated_prog_len && ulen) {
struct bpf_insn *insns_sanitized;
bool fault;
if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) {
info.xlated_prog_insns = 0;
goto done;
}
insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred);
if (!insns_sanitized)
return -ENOMEM;
uinsns = u64_to_user_ptr(info.xlated_prog_insns);
ulen = min_t(u32, info.xlated_prog_len, ulen);
fault = copy_to_user(uinsns, insns_sanitized, ulen);
kfree(insns_sanitized);
if (fault)
return -EFAULT;
}
if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_offload_info_fill(&info, prog);
if (err)
return err;
goto done;
}
/* NOTE: the following code is supposed to be skipped for offload.
* bpf_prog_offload_info_fill() is the place to fill similar fields
* for offload.
*/
ulen = info.jited_prog_len;
if (prog->aux->func_cnt) {
u32 i;
info.jited_prog_len = 0;
for (i = 0; i < prog->aux->func_cnt; i++)
info.jited_prog_len += prog->aux->func[i]->jited_len;
} else {
info.jited_prog_len = prog->jited_len;
}
if (info.jited_prog_len && ulen) {
if (bpf_dump_raw_ok(file->f_cred)) {
uinsns = u64_to_user_ptr(info.jited_prog_insns);
ulen = min_t(u32, info.jited_prog_len, ulen);
/* for multi-function programs, copy the JITed
* instructions for all the functions
*/
if (prog->aux->func_cnt) {
u32 len, free, i;
u8 *img;
free = ulen;
for (i = 0; i < prog->aux->func_cnt; i++) {
len = prog->aux->func[i]->jited_len;
len = min_t(u32, len, free);
img = (u8 *) prog->aux->func[i]->bpf_func;
if (copy_to_user(uinsns, img, len))
return -EFAULT;
uinsns += len;
free -= len;
if (!free)
break;
}
} else {
if (copy_to_user(uinsns, prog->bpf_func, ulen))
return -EFAULT;
}
} else {
info.jited_prog_insns = 0;
}
}
ulen = info.nr_jited_ksyms;
info.nr_jited_ksyms = prog->aux->func_cnt ? : 1;
if (ulen) {
if (bpf_dump_raw_ok(file->f_cred)) {
unsigned long ksym_addr;
u64 __user *user_ksyms;
u32 i;
/* copy the address of the kernel symbol
* corresponding to each function
*/
ulen = min_t(u32, info.nr_jited_ksyms, ulen);
user_ksyms = u64_to_user_ptr(info.jited_ksyms);
if (prog->aux->func_cnt) {
for (i = 0; i < ulen; i++) {
ksym_addr = (unsigned long)
prog->aux->func[i]->bpf_func;
if (put_user((u64) ksym_addr,
&user_ksyms[i]))
return -EFAULT;
}
} else {
ksym_addr = (unsigned long) prog->bpf_func;
if (put_user((u64) ksym_addr, &user_ksyms[0]))
return -EFAULT;
}
} else {
info.jited_ksyms = 0;
}
}
ulen = info.nr_jited_func_lens;
info.nr_jited_func_lens = prog->aux->func_cnt ? : 1;
if (ulen) {
if (bpf_dump_raw_ok(file->f_cred)) {
u32 __user *user_lens;
u32 func_len, i;
/* copy the JITed image lengths for each function */
ulen = min_t(u32, info.nr_jited_func_lens, ulen);
user_lens = u64_to_user_ptr(info.jited_func_lens);
if (prog->aux->func_cnt) {
for (i = 0; i < ulen; i++) {
func_len =
prog->aux->func[i]->jited_len;
if (put_user(func_len, &user_lens[i]))
return -EFAULT;
}
} else {
func_len = prog->jited_len;
if (put_user(func_len, &user_lens[0]))
return -EFAULT;
}
} else {
info.jited_func_lens = 0;
}
}
if (prog->aux->btf)
info.btf_id = btf_obj_id(prog->aux->btf);
ulen = info.nr_func_info;
info.nr_func_info = prog->aux->func_info_cnt;
if (info.nr_func_info && ulen) {
char __user *user_finfo;
user_finfo = u64_to_user_ptr(info.func_info);
ulen = min_t(u32, info.nr_func_info, ulen);
if (copy_to_user(user_finfo, prog->aux->func_info,
info.func_info_rec_size * ulen))
return -EFAULT;
}
ulen = info.nr_line_info;
info.nr_line_info = prog->aux->nr_linfo;
if (info.nr_line_info && ulen) {
__u8 __user *user_linfo;
user_linfo = u64_to_user_ptr(info.line_info);
ulen = min_t(u32, info.nr_line_info, ulen);
if (copy_to_user(user_linfo, prog->aux->linfo,
info.line_info_rec_size * ulen))
return -EFAULT;
}
ulen = info.nr_jited_line_info;
if (prog->aux->jited_linfo)
info.nr_jited_line_info = prog->aux->nr_linfo;
else
info.nr_jited_line_info = 0;
if (info.nr_jited_line_info && ulen) {
if (bpf_dump_raw_ok(file->f_cred)) {
__u64 __user *user_linfo;
u32 i;
user_linfo = u64_to_user_ptr(info.jited_line_info);
ulen = min_t(u32, info.nr_jited_line_info, ulen);
for (i = 0; i < ulen; i++) {
if (put_user((__u64)(long)prog->aux->jited_linfo[i],
&user_linfo[i]))
return -EFAULT;
}
} else {
info.jited_line_info = 0;
}
}
ulen = info.nr_prog_tags;
info.nr_prog_tags = prog->aux->func_cnt ? : 1;
if (ulen) {
__u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
u32 i;
user_prog_tags = u64_to_user_ptr(info.prog_tags);
ulen = min_t(u32, info.nr_prog_tags, ulen);
if (prog->aux->func_cnt) {
for (i = 0; i < ulen; i++) {
if (copy_to_user(user_prog_tags[i],
prog->aux->func[i]->tag,
BPF_TAG_SIZE))
return -EFAULT;
}
} else {
if (copy_to_user(user_prog_tags[0],
prog->tag, BPF_TAG_SIZE))
return -EFAULT;
}
}
done:
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
return -EFAULT;
return 0;
}
static int bpf_map_get_info_by_fd(struct file *file,
struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_map_info info;
u32 info_len = attr->info.info_len;
int err;
err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
memset(&info, 0, sizeof(info));
info.type = map->map_type;
info.id = map->id;
info.key_size = map->key_size;
info.value_size = map->value_size;
info.max_entries = map->max_entries;
info.map_flags = map->map_flags;
memcpy(info.name, map->name, sizeof(map->name));
if (map->btf) {
info.btf_id = btf_obj_id(map->btf);
info.btf_key_type_id = map->btf_key_type_id;
info.btf_value_type_id = map->btf_value_type_id;
}
info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_info_fill(&info, map);
if (err)
return err;
}
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
return -EFAULT;
return 0;
}
static int bpf_btf_get_info_by_fd(struct file *file,
struct btf *btf,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
u32 info_len = attr->info.info_len;
int err;
err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
if (err)
return err;
return btf_get_info_by_fd(btf, attr, uattr);
}
static int bpf_link_get_info_by_fd(struct file *file,
struct bpf_link *link,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_link_info info;
u32 info_len = attr->info.info_len;
int err;
err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len);
if (err)
return err;
info_len = min_t(u32, sizeof(info), info_len);
memset(&info, 0, sizeof(info));
if (copy_from_user(&info, uinfo, info_len))
return -EFAULT;
info.type = link->type;
info.id = link->id;
info.prog_id = link->prog->aux->id;
if (link->ops->fill_link_info) {
err = link->ops->fill_link_info(link, &info);
if (err)
return err;
}
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
return -EFAULT;
return 0;
}
#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
int ufd = attr->info.bpf_fd;
struct fd f;
int err;
if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
return -EINVAL;
f = fdget(ufd);
if (!f.file)
return -EBADFD;
if (f.file->f_op == &bpf_prog_fops)
err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
uattr);
else if (f.file->f_op == &bpf_map_fops)
err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
uattr);
else if (f.file->f_op == &btf_fops)
err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
else if (f.file->f_op == &bpf_link_fops)
err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
attr, uattr);
else
err = -EINVAL;
fdput(f);
return err;
}
#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
{
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
if (!bpf_capable())
return -EPERM;
return btf_new_fd(attr, uattr);
}
#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
{
if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
return btf_get_fd_by_id(attr->btf_id);
}
static int bpf_task_fd_query_copy(const union bpf_attr *attr,
union bpf_attr __user *uattr,
u32 prog_id, u32 fd_type,
const char *buf, u64 probe_offset,
u64 probe_addr)
{
char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
u32 len = buf ? strlen(buf) : 0, input_len;
int err = 0;
if (put_user(len, &uattr->task_fd_query.buf_len))
return -EFAULT;
input_len = attr->task_fd_query.buf_len;
if (input_len && ubuf) {
if (!len) {
/* nothing to copy, just make ubuf NULL terminated */
char zero = '\0';
if (put_user(zero, ubuf))
return -EFAULT;
} else if (input_len >= len + 1) {
/* ubuf can hold the string with NULL terminator */
if (copy_to_user(ubuf, buf, len + 1))
return -EFAULT;
} else {
/* ubuf cannot hold the string with NULL terminator,
* do a partial copy with NULL terminator.
*/
char zero = '\0';
err = -ENOSPC;
if (copy_to_user(ubuf, buf, input_len - 1))
return -EFAULT;
if (put_user(zero, ubuf + input_len - 1))
return -EFAULT;
}
}
if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
put_user(fd_type, &uattr->task_fd_query.fd_type) ||
put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
put_user(probe_addr, &uattr->task_fd_query.probe_addr))
return -EFAULT;
return err;
}
#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
static int bpf_task_fd_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
pid_t pid = attr->task_fd_query.pid;
u32 fd = attr->task_fd_query.fd;
const struct perf_event *event;
struct task_struct *task;
struct file *file;
int err;
if (CHECK_ATTR(BPF_TASK_FD_QUERY))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (attr->task_fd_query.flags != 0)
return -EINVAL;
rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
rcu_read_unlock();
if (!task)
return -ENOENT;
err = 0;
file = fget_task(task, fd);
put_task_struct(task);
if (!file)
return -EBADF;
if (file->f_op == &bpf_link_fops) {
struct bpf_link *link = file->private_data;
if (link->ops == &bpf_raw_tp_link_lops) {
struct bpf_raw_tp_link *raw_tp =
container_of(link, struct bpf_raw_tp_link, link);
struct bpf_raw_event_map *btp = raw_tp->btp;
err = bpf_task_fd_query_copy(attr, uattr,
raw_tp->link.prog->aux->id,
BPF_FD_TYPE_RAW_TRACEPOINT,
btp->tp->name, 0, 0);
goto put_file;
}
goto out_not_supp;
}
event = perf_get_event(file);
if (!IS_ERR(event)) {
u64 probe_offset, probe_addr;
u32 prog_id, fd_type;
const char *buf;
err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
&buf, &probe_offset,
&probe_addr);
if (!err)
err = bpf_task_fd_query_copy(attr, uattr, prog_id,
fd_type, buf,
probe_offset,
probe_addr);
goto put_file;
}
out_not_supp:
err = -ENOTSUPP;
put_file:
fput(file);
return err;
}
#define BPF_MAP_BATCH_LAST_FIELD batch.flags
#define BPF_DO_BATCH(fn) \
do { \
if (!fn) { \
err = -ENOTSUPP; \
goto err_put; \
} \
err = fn(map, attr, uattr); \
} while (0)
static int bpf_map_do_batch(const union bpf_attr *attr,
union bpf_attr __user *uattr,
int cmd)
{
bool has_read = cmd == BPF_MAP_LOOKUP_BATCH ||
cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
struct bpf_map *map;
int err, ufd;
struct fd f;
if (CHECK_ATTR(BPF_MAP_BATCH))
return -EINVAL;
ufd = attr->batch.map_fd;
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
if (has_write)
bpf_map_write_active_inc(map);
if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
}
if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
if (cmd == BPF_MAP_LOOKUP_BATCH)
BPF_DO_BATCH(map->ops->map_lookup_batch);
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
else if (cmd == BPF_MAP_UPDATE_BATCH)
BPF_DO_BATCH(map->ops->map_update_batch);
else
BPF_DO_BATCH(map->ops->map_delete_batch);
err_put:
if (has_write)
bpf_map_write_active_dec(map);
fdput(f);
return err;
}
static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
struct bpf_prog *prog)
{
if (attr->link_create.attach_type != prog->expected_attach_type)
return -EINVAL;
if (prog->expected_attach_type == BPF_TRACE_ITER)
return bpf_iter_link_attach(attr, uattr, prog);
else if (prog->type == BPF_PROG_TYPE_EXT)
return bpf_tracing_prog_attach(prog,
attr->link_create.target_fd,
attr->link_create.target_btf_id);
return -EINVAL;
}
#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type ptype;
struct bpf_prog *prog;
int ret;
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
prog = bpf_prog_get(attr->link_create.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
ret = bpf_prog_attach_check_attach_type(prog,
attr->link_create.attach_type);
if (ret)
goto out;
switch (prog->type) {
case BPF_PROG_TYPE_EXT:
ret = tracing_bpf_link_attach(attr, uattr, prog);
goto out;
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_KPROBE:
case BPF_PROG_TYPE_TRACEPOINT:
if (attr->link_create.attach_type != BPF_PERF_EVENT) {
ret = -EINVAL;
goto out;
}
ptype = prog->type;
break;
default:
ptype = attach_type_to_prog_type(attr->link_create.attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
ret = -EINVAL;
goto out;
}
break;
}
switch (ptype) {
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
ret = cgroup_bpf_link_attach(attr, prog);
break;
case BPF_PROG_TYPE_TRACING:
ret = tracing_bpf_link_attach(attr, uattr, prog);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_SK_LOOKUP:
ret = netns_bpf_link_create(attr, prog);
break;
#ifdef CONFIG_NET
case BPF_PROG_TYPE_XDP:
ret = bpf_xdp_link_attach(attr, prog);
break;
#endif
#ifdef CONFIG_PERF_EVENTS
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
case BPF_PROG_TYPE_KPROBE:
ret = bpf_perf_link_attach(attr, prog);
break;
#endif
default:
ret = -EINVAL;
}
out:
if (ret < 0)
bpf_prog_put(prog);
return ret;
}
#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
static int link_update(union bpf_attr *attr)
{
struct bpf_prog *old_prog = NULL, *new_prog;
struct bpf_link *link;
u32 flags;
int ret;
if (CHECK_ATTR(BPF_LINK_UPDATE))
return -EINVAL;
flags = attr->link_update.flags;
if (flags & ~BPF_F_REPLACE)
return -EINVAL;
link = bpf_link_get_from_fd(attr->link_update.link_fd);
if (IS_ERR(link))
return PTR_ERR(link);
new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
if (IS_ERR(new_prog)) {
ret = PTR_ERR(new_prog);
goto out_put_link;
}
if (flags & BPF_F_REPLACE) {
old_prog = bpf_prog_get(attr->link_update.old_prog_fd);
if (IS_ERR(old_prog)) {
ret = PTR_ERR(old_prog);
old_prog = NULL;
goto out_put_progs;
}
} else if (attr->link_update.old_prog_fd) {
ret = -EINVAL;
goto out_put_progs;
}
if (link->ops->update_prog)
ret = link->ops->update_prog(link, new_prog, old_prog);
else
ret = -EINVAL;
out_put_progs:
if (old_prog)
bpf_prog_put(old_prog);
if (ret)
bpf_prog_put(new_prog);
out_put_link:
bpf_link_put(link);
return ret;
}
#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
static int link_detach(union bpf_attr *attr)
{
struct bpf_link *link;
int ret;
if (CHECK_ATTR(BPF_LINK_DETACH))
return -EINVAL;
link = bpf_link_get_from_fd(attr->link_detach.link_fd);
if (IS_ERR(link))
return PTR_ERR(link);
if (link->ops->detach)
ret = link->ops->detach(link);
else
ret = -EOPNOTSUPP;
bpf_link_put(link);
return ret;
}
static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
{
return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
}
struct bpf_link *bpf_link_by_id(u32 id)
{
struct bpf_link *link;
if (!id)
return ERR_PTR(-ENOENT);
spin_lock_bh(&link_idr_lock);
/* before link is "settled", ID is 0, pretend it doesn't exist yet */
link = idr_find(&link_idr, id);
if (link) {
if (link->id)
link = bpf_link_inc_not_zero(link);
else
link = ERR_PTR(-EAGAIN);
} else {
link = ERR_PTR(-ENOENT);
}
spin_unlock_bh(&link_idr_lock);
return link;
}
#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
{
struct bpf_link *link;
u32 id = attr->link_id;
int fd;
if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
link = bpf_link_by_id(id);
if (IS_ERR(link))
return PTR_ERR(link);
fd = bpf_link_new_fd(link);
if (fd < 0)
bpf_link_put(link);
return fd;
}
DEFINE_MUTEX(bpf_stats_enabled_mutex);
static int bpf_stats_release(struct inode *inode, struct file *file)
{
mutex_lock(&bpf_stats_enabled_mutex);
static_key_slow_dec(&bpf_stats_enabled_key.key);
mutex_unlock(&bpf_stats_enabled_mutex);
return 0;
}
static const struct file_operations bpf_stats_fops = {
.release = bpf_stats_release,
};
static int bpf_enable_runtime_stats(void)
{
int fd;
mutex_lock(&bpf_stats_enabled_mutex);
/* Set a very high limit to avoid overflow */
if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
mutex_unlock(&bpf_stats_enabled_mutex);
return -EBUSY;
}
fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);
if (fd >= 0)
static_key_slow_inc(&bpf_stats_enabled_key.key);
mutex_unlock(&bpf_stats_enabled_mutex);
return fd;
}
#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
static int bpf_enable_stats(union bpf_attr *attr)
{
if (CHECK_ATTR(BPF_ENABLE_STATS))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
switch (attr->enable_stats.type) {
case BPF_STATS_RUN_TIME:
return bpf_enable_runtime_stats();
default:
break;
}
return -EINVAL;
}
#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
static int bpf_iter_create(union bpf_attr *attr)
{
struct bpf_link *link;
int err;
if (CHECK_ATTR(BPF_ITER_CREATE))
return -EINVAL;
if (attr->iter_create.flags)
return -EINVAL;
link = bpf_link_get_from_fd(attr->iter_create.link_fd);
if (IS_ERR(link))
return PTR_ERR(link);
err = bpf_iter_new_fd(link);
bpf_link_put(link);
return err;
}
#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
static int bpf_prog_bind_map(union bpf_attr *attr)
{
struct bpf_prog *prog;
struct bpf_map *map;
struct bpf_map **used_maps_old, **used_maps_new;
int i, ret = 0;
if (CHECK_ATTR(BPF_PROG_BIND_MAP))
return -EINVAL;
if (attr->prog_bind_map.flags)
return -EINVAL;
prog = bpf_prog_get(attr->prog_bind_map.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
map = bpf_map_get(attr->prog_bind_map.map_fd);
if (IS_ERR(map)) {
ret = PTR_ERR(map);
goto out_prog_put;
}
mutex_lock(&prog->aux->used_maps_mutex);
used_maps_old = prog->aux->used_maps;
for (i = 0; i < prog->aux->used_map_cnt; i++)
if (used_maps_old[i] == map) {
bpf_map_put(map);
goto out_unlock;
}
used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
sizeof(used_maps_new[0]),
GFP_KERNEL);
if (!used_maps_new) {
ret = -ENOMEM;
goto out_unlock;
}
memcpy(used_maps_new, used_maps_old,
sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
used_maps_new[prog->aux->used_map_cnt] = map;
prog->aux->used_map_cnt++;
prog->aux->used_maps = used_maps_new;
kfree(used_maps_old);
out_unlock:
mutex_unlock(&prog->aux->used_maps_mutex);
if (ret)
bpf_map_put(map);
out_prog_put:
bpf_prog_put(prog);
return ret;
}
static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
int err;
if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
return -EPERM;
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
size = min_t(u32, size, sizeof(attr));
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
memset(&attr, 0, sizeof(attr));
if (copy_from_bpfptr(&attr, uattr, size) != 0)
return -EFAULT;
trace_android_vh_check_bpf_syscall(cmd, &attr, size);
err = security_bpf(cmd, &attr, size);
if (err < 0)
return err;
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
break;
case BPF_MAP_UPDATE_ELEM:
err = map_update_elem(&attr, uattr);
break;
case BPF_MAP_DELETE_ELEM:
err = map_delete_elem(&attr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
break;
case BPF_MAP_FREEZE:
err = map_freeze(&attr);
break;
case BPF_PROG_LOAD:
err = bpf_prog_load(&attr, uattr);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
break;
case BPF_OBJ_GET:
err = bpf_obj_get(&attr);
break;
case BPF_PROG_ATTACH:
err = bpf_prog_attach(&attr);
break;
case BPF_PROG_DETACH:
err = bpf_prog_detach(&attr);
break;
case BPF_PROG_QUERY:
err = bpf_prog_query(&attr, uattr.user);
break;
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr.user);
break;
case BPF_PROG_GET_NEXT_ID:
err = bpf_obj_get_next_id(&attr, uattr.user,
&prog_idr, &prog_idr_lock);
break;
case BPF_MAP_GET_NEXT_ID:
err = bpf_obj_get_next_id(&attr, uattr.user,
&map_idr, &map_idr_lock);
break;
case BPF_BTF_GET_NEXT_ID:
err = bpf_obj_get_next_id(&attr, uattr.user,
&btf_idr, &btf_idr_lock);
break;
case BPF_PROG_GET_FD_BY_ID:
err = bpf_prog_get_fd_by_id(&attr);
break;
case BPF_MAP_GET_FD_BY_ID:
err = bpf_map_get_fd_by_id(&attr);
break;
case BPF_OBJ_GET_INFO_BY_FD:
err = bpf_obj_get_info_by_fd(&attr, uattr.user);
break;
case BPF_RAW_TRACEPOINT_OPEN:
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
err = bpf_btf_load(&attr, uattr);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
break;
case BPF_TASK_FD_QUERY:
err = bpf_task_fd_query(&attr, uattr.user);
break;
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
err = map_lookup_and_delete_elem(&attr);
break;
case BPF_MAP_LOOKUP_BATCH:
err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH);
break;
case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
err = bpf_map_do_batch(&attr, uattr.user,
BPF_MAP_LOOKUP_AND_DELETE_BATCH);
break;
case BPF_MAP_UPDATE_BATCH:
err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH);
break;
case BPF_MAP_DELETE_BATCH:
err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH);
break;
case BPF_LINK_CREATE:
err = link_create(&attr, uattr);
break;
case BPF_LINK_UPDATE:
err = link_update(&attr);
break;
case BPF_LINK_GET_FD_BY_ID:
err = bpf_link_get_fd_by_id(&attr);
break;
case BPF_LINK_GET_NEXT_ID:
err = bpf_obj_get_next_id(&attr, uattr.user,
&link_idr, &link_idr_lock);
break;
case BPF_ENABLE_STATS:
err = bpf_enable_stats(&attr);
break;
case BPF_ITER_CREATE:
err = bpf_iter_create(&attr);
break;
case BPF_LINK_DETACH:
err = link_detach(&attr);
break;
case BPF_PROG_BIND_MAP:
err = bpf_prog_bind_map(&attr);
break;
default:
err = -EINVAL;
break;
}
return err;
}
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
return __sys_bpf(cmd, USER_BPFPTR(uattr), size);
}
static bool syscall_prog_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= U16_MAX)
return false;
if (off % size != 0)
return false;
return true;
}
BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
{
switch (cmd) {
case BPF_MAP_CREATE:
case BPF_MAP_UPDATE_ELEM:
case BPF_MAP_FREEZE:
case BPF_PROG_LOAD:
case BPF_BTF_LOAD:
break;
/* case BPF_PROG_TEST_RUN:
* is not part of this list to prevent recursive test_run
*/
default:
return -EINVAL;
}
return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
}
static const struct bpf_func_proto bpf_sys_bpf_proto = {
.func = bpf_sys_bpf,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
const struct bpf_func_proto * __weak
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
return bpf_base_func_proto(func_id);
}
BPF_CALL_1(bpf_sys_close, u32, fd)
{
/* When bpf program calls this helper there should not be
* an fdget() without matching completed fdput().
* This helper is allowed in the following callchain only:
* sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
*/
return close_fd(fd);
}
static const struct bpf_func_proto bpf_sys_close_proto = {
.func = bpf_sys_close,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
static const struct bpf_func_proto *
syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_sys_bpf:
return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
case BPF_FUNC_btf_find_by_name_kind:
return &bpf_btf_find_by_name_kind_proto;
case BPF_FUNC_sys_close:
return &bpf_sys_close_proto;
default:
return tracing_prog_func_proto(func_id, prog);
}
}
const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
.get_func_proto = syscall_prog_func_proto,
.is_valid_access = syscall_prog_is_valid_access,
};
const struct bpf_prog_ops bpf_syscall_prog_ops = {
.test_run = bpf_prog_test_run_syscall,
};