Evictable pages are divided into multiple generations for each lruvec. The youngest generation number is stored in lrugen->max_seq for both anon and file types as they are aged on an equal footing. The oldest generation numbers are stored in lrugen->min_seq[] separately for anon and file types as clean file pages can be evicted regardless of swap constraints. These three variables are monotonically increasing.

Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into the gen counter in page->flags. Each truncated generation number is an index to lrugen->lists[]. The sliding window technique is used to track at least MIN_NR_GENS and at most MAX_NR_GENS generations. The gen counter stores a value within [1, MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it stores 0.

There are two conceptually independent procedures: "the aging", which produces young generations, and "the eviction", which consumes old generations. They form a closed-loop system, i.e., "the page reclaim". Both procedures can be invoked from userspace for the purposes of working set estimation and proactive reclaim. These techniques are commonly used to optimize job scheduling (bin packing) in data centers [1][2].

To avoid confusion, the terms "hot" and "cold" will be applied to the multi-gen LRU, as a new convention; the terms "active" and "inactive" will be applied to the active/inactive LRU, as usual.

The protection of hot pages and the selection of cold pages are based on page access channels and patterns. There are two access channels: one through page tables and the other through file descriptors. The protection of the former channel is by design stronger because:
1. The uncertainty in determining the access patterns of the former channel is higher due to the approximation of the accessed bit.
2. The cost of evicting the former channel is higher due to the TLB flushes required and the likelihood of encountering the dirty bit.
3. The penalty of underprotecting the former channel is higher because applications usually do not prepare themselves for major page faults like they do for blocked I/O. E.g., GUI applications commonly use dedicated I/O threads to avoid blocking rendering threads.

There are also two access patterns: one with temporal locality and the other without. For the reasons listed above, the former channel is assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is present; the latter channel is assumed to follow the latter pattern unless outlying refaults have been observed [3][4].

The next patch will address the "outlying refaults". Three macros, i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in this patch to make the entire patchset less diffy.

A page is added to the youngest generation on faulting. The aging needs to check the accessed bit at least twice before handing this page over to the eviction. The first check takes care of the accessed bit set on the initial fault; the second check makes sure this page has not been used since then. This protocol, AKA second chance, requires a minimum of two generations, hence MIN_NR_GENS.
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
[3] https://lwn.net/Articles/495543/
[4] https://lwn.net/Articles/815342/

Link: https://lkml.kernel.org/r/20220918080010.2920238-6-yuzhao@google.com
Change-Id: I7b24d1e9d263e4eb2c2ee23f2eb143824fcb5201
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
(cherry picked from commit ec1c86b25f4bdd9dce6436c0539d2a6ae676e1c4)
[ Resolve conflicts in mm/memory.c, mm/memcontrol.c, mm/Kconfig, include/linux/mm_inline.h]
Bug: 249601646
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
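To make the encoding described above concrete, here is a minimal standalone sketch (not part of this patch). It folds a generation sequence number into a few flag bits the way lru_gen_from_seq() and the gen counter in page->flags do, storing gen + 1 so that 0 means "not on lrugen->lists[]". The constants and helper names below (an assumed MAX_NR_GENS of 4, hence order_base_2(4 + 1) = 3 bits, and gen_from_seq/encode_gen/decode_gen) are illustrative assumptions, not definitions taken from the kernel headers:

/* Standalone sketch; the constants below are assumptions for illustration. */
#include <assert.h>
#include <stdio.h>

#define MAX_NR_GENS	4UL	/* assumed value; the kernel defines its own */
#define GEN_WIDTH	3	/* order_base_2(MAX_NR_GENS + 1) */
#define GEN_MASK	((1UL << GEN_WIDTH) - 1)

/* truncate a monotonically increasing seq into a list index */
static unsigned long gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

/* store gen + 1 in the counter bits; 0 means the page is off the lists */
static unsigned long encode_gen(unsigned long flags, unsigned long seq)
{
	return (flags & ~GEN_MASK) | (gen_from_seq(seq) + 1);
}

/* decode back to a generation index, or -1 when not on any list */
static long decode_gen(unsigned long flags)
{
	return (long)(flags & GEN_MASK) - 1;
}

int main(void)
{
	unsigned long flags = 0;

	assert(decode_gen(flags) == -1);	/* not on lrugen->lists[] */
	flags = encode_gen(flags, 5);		/* seq 5 -> gen 1 -> counter 2 */
	assert(decode_gen(flags) == (long)gen_from_seq(5));
	printf("seq 5 maps to gen %ld\n", decode_gen(flags));
	return 0;
}

While a page is on one of lrugen->lists[], the counter thus holds a value within [1, MAX_NR_GENS]; otherwise it holds 0, matching the description above.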
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H

#include <linux/huge_mm.h>
#include <linux/swap.h>
#include <linux/string.h>

/**
 * page_is_file_lru - should the page be on a file LRU or anon LRU?
 * @page: the page to test
 *
 * Returns 1 if @page is a regular filesystem backed page cache page or a lazily
 * freed anonymous page (e.g. via MADV_FREE). Returns 0 if @page is a normal
 * anonymous page, a tmpfs page or otherwise ram or swap backed page. Used by
 * functions that manipulate the LRU lists, to sort a page onto the right LRU
 * list.
 *
 * We would like to get this info without a page flag, but the state
 * needs to survive until the page is last deleted from the LRU, which
 * could be as far down as __page_cache_release.
 */
static inline int page_is_file_lru(struct page *page)
{
	return !PageSwapBacked(page);
}

static __always_inline void __update_lru_size(struct lruvec *lruvec,
				enum lru_list lru, enum zone_type zid,
				long nr_pages)
{
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	lockdep_assert_held(&lruvec->lru_lock);
	WARN_ON_ONCE(nr_pages != (int)nr_pages);

	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
	__mod_zone_page_state(&pgdat->node_zones[zid],
				NR_ZONE_LRU_BASE + lru, nr_pages);
}

static __always_inline void update_lru_size(struct lruvec *lruvec,
				enum lru_list lru, enum zone_type zid,
				long nr_pages)
{
	__update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}

/**
 * __clear_page_lru_flags - clear page lru flags before releasing a page
 * @page: the page that was on lru and now has a zero reference
 */
static __always_inline void __clear_page_lru_flags(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLRU(page), page);

	__ClearPageLRU(page);

	/* this shouldn't happen, so leave the flags to bad_page() */
	if (PageActive(page) && PageUnevictable(page))
		return;

	__ClearPageActive(page);
	__ClearPageUnevictable(page);
}

/**
 * page_lru - which LRU list should a page be on?
 * @page: the page to test
 *
 * Returns the LRU list a page should be on, as an index
 * into the array of LRU lists.
 */
static __always_inline enum lru_list page_lru(struct page *page)
{
	enum lru_list lru;

	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);

	if (PageUnevictable(page))
		return LRU_UNEVICTABLE;

	lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
	if (PageActive(page))
		lru += LRU_ACTIVE;

	return lru;
}

#ifdef CONFIG_LRU_GEN

static inline bool lru_gen_enabled(void)
{
	return true;
}

static inline bool lru_gen_in_fault(void)
{
	return current->in_lru_fault;
}

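/* truncate a generation number into an index into lrugen->lists[] */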
static inline int lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

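/*
 * Read the gen counter in page->flags: it stores gen + 1 while the page is
 * on one of lrugen->lists[] and 0 otherwise, so -1 means "not on a list".
 */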
static inline int page_lru_gen(struct page *page)
{
	unsigned long flags = READ_ONCE(page->flags);

	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}

static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
	unsigned long max_seq = lruvec->lrugen.max_seq;

	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);

	/* see the comment on MIN_NR_GENS */
	return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
}

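/*
 * Transfer a page's nr_pages accounting from old_gen to new_gen; -1 means
 * the page is being added to or deleted from the multi-gen LRU, in which
 * case the conventional lru_size counters are updated as well (the two
 * youngest generations count as active, the rest as inactive).
 */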
static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
				       int old_gen, int new_gen)
{
	int type = page_is_file_lru(page);
	int zone = page_zonenum(page);
	int delta = thp_nr_pages(page);
	enum lru_list lru = type * LRU_INACTIVE_FILE;
	struct lru_gen_struct *lrugen = &lruvec->lrugen;

	VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
	VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
	VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);

	if (old_gen >= 0)
		WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
			   lrugen->nr_pages[old_gen][type][zone] - delta);
	if (new_gen >= 0)
		WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
			   lrugen->nr_pages[new_gen][type][zone] + delta);

	/* addition */
	if (old_gen < 0) {
		if (lru_gen_is_active(lruvec, new_gen))
			lru += LRU_ACTIVE;
		__update_lru_size(lruvec, lru, zone, delta);
		return;
	}

	/* deletion */
	if (new_gen < 0) {
		if (lru_gen_is_active(lruvec, old_gen))
			lru += LRU_ACTIVE;
		__update_lru_size(lruvec, lru, zone, -delta);
		return;
	}
}

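/*
 * Add a page to the multi-gen LRU of this lruvec: pick a generation (see
 * the three cases below), encode it in page->flags and link the page onto
 * lrugen->lists[]. Returns false for unevictable pages so that the caller
 * falls back to the conventional handling.
 */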
static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
	unsigned long seq;
	unsigned long flags;
	int gen = page_lru_gen(page);
	int type = page_is_file_lru(page);
	int zone = page_zonenum(page);
	struct lru_gen_struct *lrugen = &lruvec->lrugen;

	VM_WARN_ON_ONCE_PAGE(gen != -1, page);

	if (PageUnevictable(page))
		return false;
	/*
	 * There are three common cases for this page:
	 * 1. If it's hot, e.g., freshly faulted in or previously hot and
	 *    migrated, add it to the youngest generation.
	 * 2. If it's cold but can't be evicted immediately, i.e., an anon page
	 *    not in swapcache or a dirty page pending writeback, add it to the
	 *    second oldest generation.
	 * 3. Everything else (clean, cold) is added to the oldest generation.
	 */
	if (PageActive(page))
		seq = lrugen->max_seq;
	else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) ||
		 (PageReclaim(page) &&
		  (PageDirty(page) || PageWriteback(page))))
		seq = lrugen->min_seq[type] + 1;
	else
		seq = lrugen->min_seq[type];

	gen = lru_gen_from_seq(seq);
	flags = (gen + 1UL) << LRU_GEN_PGOFF;
	/* see the comment on MIN_NR_GENS about PG_active */
	set_mask_bits(&page->flags, LRU_GEN_MASK | BIT(PG_active), flags);

	lru_gen_update_size(lruvec, page, -1, gen);
	/* for rotate_reclaimable_page() */
	if (reclaiming)
		list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
	else
		list_add(&page->lru, &lrugen->lists[gen][type][zone]);

	return true;
}

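/*
 * Delete a page from the multi-gen LRU: clear the gen counter in
 * page->flags and unlink the page. PG_active is set for pages from an
 * active generation when not reclaiming, so that migrate_page_states()
 * can carry the equivalent state over. Returns false if the page is not
 * on lrugen->lists[].
 */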
static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
	unsigned long flags;
	int gen = page_lru_gen(page);

	if (gen < 0)
		return false;

	VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
	VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);

	/* for migrate_page_states() */
	flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
	flags = set_mask_bits(&page->flags, LRU_GEN_MASK, flags);
	gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;

	lru_gen_update_size(lruvec, page, gen, -1);
	list_del(&page->lru);

	return true;
}

#else /* !CONFIG_LRU_GEN */

static inline bool lru_gen_enabled(void)
{
	return false;
}

static inline bool lru_gen_in_fault(void)
{
	return false;
}

static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
	return false;
}

static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
	return false;
}

#endif /* CONFIG_LRU_GEN */

static __always_inline void add_page_to_lru_list(struct page *page,
				struct lruvec *lruvec)
{
	enum lru_list lru = page_lru(page);

	if (lru_gen_add_page(lruvec, page, false))
		return;

	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
	list_add(&page->lru, &lruvec->lists[lru]);
}

static __always_inline void add_page_to_lru_list_tail(struct page *page,
				struct lruvec *lruvec)
{
	enum lru_list lru = page_lru(page);

	if (lru_gen_add_page(lruvec, page, true))
		return;

	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
	list_add_tail(&page->lru, &lruvec->lists[lru]);
}

static __always_inline void del_page_from_lru_list(struct page *page,
				struct lruvec *lruvec)
{
	if (lru_gen_del_page(lruvec, page, false))
		return;

	list_del(&page->lru);
	update_lru_size(lruvec, page_lru(page), page_zonenum(page),
			-thp_nr_pages(page));
}

#ifdef CONFIG_ANON_VMA_NAME
/*
 * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
 * either keep holding the lock while using the returned pointer or it should
 * raise anon_vma_name refcount before releasing the lock.
 */
extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
extern void anon_vma_name_free(struct kref *kref);

/* mmap_lock should be read-locked */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
{
	if (anon_name)
		kref_get(&anon_name->kref);
}

static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
{
	if (anon_name)
		kref_put(&anon_name->kref, anon_vma_name_free);
}

static inline
struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
{
	/* Prevent anon_name refcount saturation early on */
	if (kref_read(&anon_name->kref) < REFCOUNT_MAX) {
		anon_vma_name_get(anon_name);
		return anon_name;
	}
	return anon_vma_name_alloc(anon_name->name);
}

static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
				     struct vm_area_struct *new_vma)
{
	struct anon_vma_name *anon_name = anon_vma_name(orig_vma);

	if (anon_name)
		new_vma->anon_name = anon_vma_name_reuse(anon_name);
}

static inline void free_anon_vma_name(struct vm_area_struct *vma)
{
	/*
	 * Not using anon_vma_name because it generates a warning if mmap_lock
	 * is not held, which might be the case here.
	 */
	if (!vma->vm_file)
		anon_vma_name_put(vma->anon_name);
}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
				    struct anon_vma_name *anon_name2)
{
	if (anon_name1 == anon_name2)
		return true;

	return anon_name1 && anon_name2 &&
	       !strcmp(anon_name1->name, anon_name2->name);
}
#else /* CONFIG_ANON_VMA_NAME */
static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	return NULL;
}

static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	return NULL;
}

static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
				     struct vm_area_struct *new_vma) {}
static inline void free_anon_vma_name(struct vm_area_struct *vma) {}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
				    struct anon_vma_name *anon_name2)
{
	return true;
}

#endif /* CONFIG_ANON_VMA_NAME */

#endif