To avoid confusion, the terms "promotion" and "demotion" will be
applied to the multi-gen LRU, as a new convention; the terms
"activation" and "deactivation" will be applied to the active/inactive
LRU, as usual.
The aging produces young generations. Given an lruvec, it increments
max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS. The aging
promotes hot pages to the youngest generation when it finds them
accessed through page tables; the demotion of cold pages happens
consequently when it increments max_seq. The aging has the complexity
O(nr_hot_pages), since it is only interested in hot pages. Promotion
in the aging path does not require any LRU list operations, only the
updates of the gen counter and lrugen->nr_pages[]; demotion, unless as
the result of the increment of max_seq, requires LRU list operations,
e.g., lru_deactivate_fn().
The eviction consumes old generations. Given an lruvec, it increments
min_seq when the lists indexed by min_seq%MAX_NR_GENS become empty. A
feedback loop modeled after the PID controller monitors refaults over
anon and file types and decides which type to evict when both types
are available from the same generation.
Each generation is divided into multiple tiers. Tiers represent
different ranges of numbers of accesses through file descriptors. A
page accessed N times through file descriptors is in tier
order_base_2(N). Tiers do not have dedicated lrugen->lists[], only
bits in page->flags. In contrast to moving across generations, which
requires the LRU lock, moving across tiers only involves operations on
page->flags. The feedback loop also monitors refaults over all tiers
and decides when to protect pages in which tiers (N>1), using the
first tier (N=0,1) as a baseline. The first tier contains single-use
unmapped clean pages, which are most likely the best choices. The
eviction moves a page to the next generation, i.e., min_seq+1, if the
feedback loop decides so. This approach has the following advantages:
1. It removes the cost of activation in the buffered access path by
inferring whether pages accessed multiple times through file
descriptors are statistically hot and thus worth protecting in the
eviction path.
2. It takes pages accessed through page tables into account and avoids
overprotecting pages accessed multiple times through file
descriptors. (Pages accessed through page tables are in the first
tier, since N=0.)
3. More tiers provide better protection for pages accessed more than
twice through file descriptors, when under heavy buffered I/O
workloads.
Server benchmark results:
Single workload:
fio (buffered I/O): +[38, 40]%
IOPS BW
5.18-ed4643521e6a: 2547k 9989MiB/s
patch1-6: 3540k 13.5GiB/s
Single workload:
memcached (anon): +[103, 107]%
Ops/sec KB/sec
5.18-ed4643521e6a: 469048.66 18243.91
patch1-6: 964656.80 37520.88
Configurations:
CPU: two Xeon 6154
Mem: total 256G
Node 1 was only used as a ram disk to reduce the variance in the
results.
patch drivers/block/brd.c <<EOF
99,100c99,100
< gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
< page = alloc_page(gfp_flags);
---
> gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE;
> page = alloc_pages_node(1, gfp_flags, 0);
EOF
cat >>/etc/systemd/system.conf <<EOF
CPUAffinity=numa
NUMAPolicy=bind
NUMAMask=0
EOF
cat >>/etc/memcached.conf <<EOF
-m 184320
-s /var/run/memcached/memcached.sock
-a 0766
-t 36
-B binary
EOF
cat fio.sh
modprobe brd rd_nr=1 rd_size=113246208
swapoff -a
mkfs.ext4 /dev/ram0
mount -t ext4 /dev/ram0 /mnt
mkdir /sys/fs/cgroup/user.slice/test
echo 38654705664 >/sys/fs/cgroup/user.slice/test/memory.max
echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs
fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \
--buffered=1 --ioengine=io_uring --iodepth=128 \
--iodepth_batch_submit=32 --iodepth_batch_complete=32 \
--rw=randread --random_distribution=random --norandommap \
--time_based --ramp_time=10m --runtime=5m --group_reporting
cat memcached.sh
modprobe brd rd_nr=1 rd_size=113246208
swapoff -a
mkswap /dev/ram0
swapon /dev/ram0
memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \
--ratio 1:0 --pipeline 8 -d 2000
memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \
--ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
Client benchmark results:
kswapd profiles:
5.18-ed4643521e6a
39.56% page_vma_mapped_walk
19.32% lzo1x_1_do_compress (real work)
7.18% do_raw_spin_lock
4.23% _raw_spin_unlock_irq
2.26% vma_interval_tree_subtree_search
2.12% vma_interval_tree_iter_next
2.11% folio_referenced_one
1.90% anon_vma_interval_tree_iter_first
1.47% ptep_clear_flush
0.97% __anon_vma_interval_tree_subtree_search
patch1-6
36.13% lzo1x_1_do_compress (real work)
19.16% page_vma_mapped_walk
6.55% _raw_spin_unlock_irq
4.02% do_raw_spin_lock
2.32% anon_vma_interval_tree_iter_first
2.11% ptep_clear_flush
1.76% __zram_bvec_write
1.64% folio_referenced_one
1.40% memmove
1.35% obj_malloc
Configurations:
CPU: single Snapdragon 7c
Mem: total 4G
Chrome OS MemoryPressure [1]
[1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/
Link: https://lore.kernel.org/lkml/20220309021230.721028-7-yuzhao@google.com/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Bug: 227651406
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Change-Id: I3fe4850006d7984cd9f4fd46134b826609dc2f86
404 lines
11 KiB
C
404 lines
11 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef LINUX_MM_INLINE_H
|
|
#define LINUX_MM_INLINE_H
|
|
|
|
#include <linux/huge_mm.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/string.h>
|
|
|
|
/**
|
|
* page_is_file_lru - should the page be on a file LRU or anon LRU?
|
|
* @page: the page to test
|
|
*
|
|
* Returns 1 if @page is a regular filesystem backed page cache page or a lazily
|
|
* freed anonymous page (e.g. via MADV_FREE). Returns 0 if @page is a normal
|
|
* anonymous page, a tmpfs page or otherwise ram or swap backed page. Used by
|
|
* functions that manipulate the LRU lists, to sort a page onto the right LRU
|
|
* list.
|
|
*
|
|
* We would like to get this info without a page flag, but the state
|
|
* needs to survive until the page is last deleted from the LRU, which
|
|
* could be as far down as __page_cache_release.
|
|
*/
|
|
static inline int page_is_file_lru(struct page *page)
|
|
{
|
|
return !PageSwapBacked(page);
|
|
}
|
|
|
|
static __always_inline void __update_lru_size(struct lruvec *lruvec,
|
|
enum lru_list lru, enum zone_type zid,
|
|
long nr_pages)
|
|
{
|
|
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
|
|
|
lockdep_assert_held(&lruvec->lru_lock);
|
|
WARN_ON_ONCE(nr_pages != (int)nr_pages);
|
|
|
|
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
|
|
__mod_zone_page_state(&pgdat->node_zones[zid],
|
|
NR_ZONE_LRU_BASE + lru, nr_pages);
|
|
}
|
|
|
|
static __always_inline void update_lru_size(struct lruvec *lruvec,
|
|
enum lru_list lru, enum zone_type zid,
|
|
int nr_pages)
|
|
{
|
|
__update_lru_size(lruvec, lru, zid, nr_pages);
|
|
#ifdef CONFIG_MEMCG
|
|
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
|
|
#endif
|
|
}
|
|
|
|
/**
|
|
* __clear_page_lru_flags - clear page lru flags before releasing a page
|
|
* @page: the page that was on lru and now has a zero reference
|
|
*/
|
|
static __always_inline void __clear_page_lru_flags(struct page *page)
|
|
{
|
|
VM_BUG_ON_PAGE(!PageLRU(page), page);
|
|
|
|
__ClearPageLRU(page);
|
|
|
|
/* this shouldn't happen, so leave the flags to bad_page() */
|
|
if (PageActive(page) && PageUnevictable(page))
|
|
return;
|
|
|
|
__ClearPageActive(page);
|
|
__ClearPageUnevictable(page);
|
|
}
|
|
|
|
/**
|
|
* page_lru - which LRU list should a page be on?
|
|
* @page: the page to test
|
|
*
|
|
* Returns the LRU list a page should be on, as an index
|
|
* into the array of LRU lists.
|
|
*/
|
|
static __always_inline enum lru_list page_lru(struct page *page)
|
|
{
|
|
enum lru_list lru;
|
|
|
|
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
|
|
|
|
if (PageUnevictable(page))
|
|
return LRU_UNEVICTABLE;
|
|
|
|
lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
|
|
if (PageActive(page))
|
|
lru += LRU_ACTIVE;
|
|
|
|
return lru;
|
|
}
|
|
|
|
#ifdef CONFIG_LRU_GEN
|
|
|
|
static inline bool lru_gen_enabled(void)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
static inline bool lru_gen_in_fault(void)
|
|
{
|
|
return current->in_lru_fault;
|
|
}
|
|
|
|
static inline int lru_gen_from_seq(unsigned long seq)
|
|
{
|
|
return seq % MAX_NR_GENS;
|
|
}
|
|
|
|
static inline int lru_hist_from_seq(unsigned long seq)
|
|
{
|
|
return seq % NR_HIST_GENS;
|
|
}
|
|
|
|
static inline int lru_tier_from_refs(int refs)
|
|
{
|
|
VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH));
|
|
|
|
/* see the comment on MAX_NR_TIERS */
|
|
return order_base_2(refs + 1);
|
|
}
|
|
|
|
static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
|
|
{
|
|
unsigned long max_seq = lruvec->lrugen.max_seq;
|
|
|
|
VM_BUG_ON(gen >= MAX_NR_GENS);
|
|
|
|
/* see the comment on MIN_NR_GENS */
|
|
return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
|
|
}
|
|
|
|
static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
|
|
int old_gen, int new_gen)
|
|
{
|
|
int type = page_is_file_lru(page);
|
|
int zone = page_zonenum(page);
|
|
int delta = thp_nr_pages(page);
|
|
enum lru_list lru = type * LRU_INACTIVE_FILE;
|
|
struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
|
|
VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
|
|
VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
|
|
VM_BUG_ON(old_gen == -1 && new_gen == -1);
|
|
|
|
if (old_gen >= 0)
|
|
WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
|
|
lrugen->nr_pages[old_gen][type][zone] - delta);
|
|
if (new_gen >= 0)
|
|
WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
|
|
lrugen->nr_pages[new_gen][type][zone] + delta);
|
|
|
|
/* addition */
|
|
if (old_gen < 0) {
|
|
if (lru_gen_is_active(lruvec, new_gen))
|
|
lru += LRU_ACTIVE;
|
|
__update_lru_size(lruvec, lru, zone, delta);
|
|
return;
|
|
}
|
|
|
|
/* deletion */
|
|
if (new_gen < 0) {
|
|
if (lru_gen_is_active(lruvec, old_gen))
|
|
lru += LRU_ACTIVE;
|
|
__update_lru_size(lruvec, lru, zone, -delta);
|
|
return;
|
|
}
|
|
|
|
/* promotion */
|
|
if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
|
|
__update_lru_size(lruvec, lru, zone, -delta);
|
|
__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
|
|
}
|
|
|
|
/* demotion requires isolation, e.g., lru_deactivate_fn() */
|
|
VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
|
|
}
|
|
|
|
static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
|
{
|
|
int gen;
|
|
unsigned long old_flags, new_flags;
|
|
int type = page_is_file_lru(page);
|
|
int zone = page_zonenum(page);
|
|
struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
|
|
if (PageUnevictable(page))
|
|
return false;
|
|
/*
|
|
* There are three common cases for this page:
|
|
* 1. If it's hot, e.g., freshly faulted in or previously hot and
|
|
* migrated, add it to the youngest generation.
|
|
* 2. If it's cold but can't be evicted immediately, i.e., an anon page
|
|
* not in swapcache or a dirty page pending writeback, add it to the
|
|
* second oldest generation.
|
|
* 3. Everything else (clean, cold) is added to the oldest generation.
|
|
*/
|
|
if (PageActive(page))
|
|
gen = lru_gen_from_seq(lrugen->max_seq);
|
|
else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) ||
|
|
(PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
|
|
gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
|
|
else
|
|
gen = lru_gen_from_seq(lrugen->min_seq[type]);
|
|
|
|
do {
|
|
new_flags = old_flags = READ_ONCE(page->flags);
|
|
VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
|
|
|
|
/* see the comment on MIN_NR_GENS */
|
|
new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
|
|
new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
|
|
} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
|
|
|
|
lru_gen_update_size(lruvec, page, -1, gen);
|
|
/* for rotate_reclaimable_page() */
|
|
if (reclaiming)
|
|
list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
|
|
else
|
|
list_add(&page->lru, &lrugen->lists[gen][type][zone]);
|
|
|
|
return true;
|
|
}
|
|
|
|
static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
|
{
|
|
int gen;
|
|
unsigned long old_flags, new_flags;
|
|
|
|
do {
|
|
new_flags = old_flags = READ_ONCE(page->flags);
|
|
if (!(new_flags & LRU_GEN_MASK))
|
|
return false;
|
|
|
|
VM_BUG_ON_PAGE(PageActive(page), page);
|
|
VM_BUG_ON_PAGE(PageUnevictable(page), page);
|
|
|
|
gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
|
|
|
new_flags &= ~LRU_GEN_MASK;
|
|
if (!(new_flags & BIT(PG_referenced)))
|
|
new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
|
|
/* for shrink_page_list() */
|
|
if (reclaiming)
|
|
new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
|
|
else if (lru_gen_is_active(lruvec, gen))
|
|
new_flags |= BIT(PG_active);
|
|
} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
|
|
|
|
lru_gen_update_size(lruvec, page, gen, -1);
|
|
list_del(&page->lru);
|
|
|
|
return true;
|
|
}
|
|
|
|
#else
|
|
|
|
static inline bool lru_gen_enabled(void)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
static inline bool lru_gen_in_fault(void)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
#endif /* CONFIG_LRU_GEN */
|
|
|
|
static __always_inline void add_page_to_lru_list(struct page *page,
|
|
struct lruvec *lruvec)
|
|
{
|
|
enum lru_list lru = page_lru(page);
|
|
|
|
if (lru_gen_add_page(lruvec, page, false))
|
|
return;
|
|
|
|
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
|
list_add(&page->lru, &lruvec->lists[lru]);
|
|
}
|
|
|
|
static __always_inline void add_page_to_lru_list_tail(struct page *page,
|
|
struct lruvec *lruvec)
|
|
{
|
|
enum lru_list lru = page_lru(page);
|
|
|
|
if (lru_gen_add_page(lruvec, page, true))
|
|
return;
|
|
|
|
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
|
list_add_tail(&page->lru, &lruvec->lists[lru]);
|
|
}
|
|
|
|
static __always_inline void del_page_from_lru_list(struct page *page,
|
|
struct lruvec *lruvec)
|
|
{
|
|
if (lru_gen_del_page(lruvec, page, false))
|
|
return;
|
|
|
|
list_del(&page->lru);
|
|
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
|
|
-thp_nr_pages(page));
|
|
}
|
|
|
|
#ifdef CONFIG_ANON_VMA_NAME
|
|
/*
|
|
* mmap_lock should be read-locked when calling anon_vma_name(). Caller should
|
|
* either keep holding the lock while using the returned pointer or it should
|
|
* raise anon_vma_name refcount before releasing the lock.
|
|
*/
|
|
extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
|
|
extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
|
|
extern void anon_vma_name_free(struct kref *kref);
|
|
|
|
/* mmap_lock should be read-locked */
|
|
static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
|
|
{
|
|
if (anon_name)
|
|
kref_get(&anon_name->kref);
|
|
}
|
|
|
|
static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
|
|
{
|
|
if (anon_name)
|
|
kref_put(&anon_name->kref, anon_vma_name_free);
|
|
}
|
|
|
|
static inline
|
|
struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
|
|
{
|
|
/* Prevent anon_name refcount saturation early on */
|
|
if (kref_read(&anon_name->kref) < REFCOUNT_MAX) {
|
|
anon_vma_name_get(anon_name);
|
|
return anon_name;
|
|
|
|
}
|
|
return anon_vma_name_alloc(anon_name->name);
|
|
}
|
|
|
|
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
|
|
struct vm_area_struct *new_vma)
|
|
{
|
|
struct anon_vma_name *anon_name = anon_vma_name(orig_vma);
|
|
|
|
if (anon_name)
|
|
new_vma->anon_name = anon_vma_name_reuse(anon_name);
|
|
}
|
|
|
|
static inline void free_anon_vma_name(struct vm_area_struct *vma)
|
|
{
|
|
/*
|
|
* Not using anon_vma_name because it generates a warning if mmap_lock
|
|
* is not held, which might be the case here.
|
|
*/
|
|
if (!vma->vm_file)
|
|
anon_vma_name_put(vma->anon_name);
|
|
}
|
|
|
|
static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
|
|
struct anon_vma_name *anon_name2)
|
|
{
|
|
if (anon_name1 == anon_name2)
|
|
return true;
|
|
|
|
return anon_name1 && anon_name2 &&
|
|
!strcmp(anon_name1->name, anon_name2->name);
|
|
}
|
|
#else /* CONFIG_ANON_VMA_NAME */
|
|
static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
|
|
static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
|
|
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
|
|
struct vm_area_struct *new_vma) {}
|
|
static inline void free_anon_vma_name(struct vm_area_struct *vma) {}
|
|
|
|
static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
|
|
struct anon_vma_name *anon_name2)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
#endif /* CONFIG_ANON_VMA_NAME */
|
|
|
|
#endif
|