diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fe88e46ea1d6..b1f35790ee03 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -447,6 +447,7 @@ static inline struct obj_cgroup *__page_objcg(struct page *page) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() * * For a kmem page a caller should hold an rcu read lock to protect memcg * associated with a kmem page from being released. @@ -502,6 +503,7 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() * * For a kmem page a caller should hold an rcu read lock to protect memcg * associated with a kmem page from being released. @@ -958,6 +960,23 @@ void unlock_page_memcg(struct page *page); void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); +/* try to stablize page_memcg() for all the pages in a memcg */ +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + rcu_read_lock(); + + if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) + return true; + + rcu_read_unlock(); + return false; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) @@ -1374,6 +1393,18 @@ static inline void unlock_page_memcg(struct page *page) { } +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) +{ + /* to match page_memcg_rcu() */ + rcu_read_lock(); + return true; +} + +static inline void mem_cgroup_unlock_pages(void) +{ + rcu_read_unlock(); +} + static inline void mem_cgroup_handle_over_high(void) { } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5a226c184afa..b63f7b5ecc9c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -356,6 +356,7 @@ enum lruvec_flags { #ifndef __GENERATING_BOUNDS_H struct lruvec; +struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) @@ -411,6 +412,7 @@ struct lru_gen_struct { }; void lru_gen_init_lruvec(struct lruvec *lruvec); +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG void lru_gen_init_memcg(struct mem_cgroup *memcg); @@ -423,6 +425,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } +static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ +} + #ifdef CONFIG_MEMCG static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { diff --git a/mm/internal.h b/mm/internal.h index cf3cb933eba3..5c73246a092e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -35,6 +35,7 @@ void page_writeback_init(void); vm_fault_t do_swap_page(struct vm_fault *vmf); +void activate_page(struct page *page); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 52d16881ab81..1505397c3367 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2802,6 +2802,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() */ page->memcg_data = (unsigned long)memcg; } diff --git a/mm/rmap.c b/mm/rmap.c index 3932a6e9b6ad..df89881c7100 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -73,6 +73,7 @@ #include #include #include +#include #include @@ -797,6 +798,12 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (pvmw.pte) { + if (lru_gen_enabled() && pte_young(*pvmw.pte) && + !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { + lru_gen_look_around(&pvmw); + referenced++; + } + if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { /* diff --git a/mm/swap.c b/mm/swap.c index 5d227577b609..966ff2d83343 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -325,7 +325,7 @@ static bool need_activate_page_drain(int cpu) return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } -static void activate_page(struct page *page) +void activate_page(struct page *page) { page = compound_head(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -345,7 +345,7 @@ static inline void activate_page_drain(int cpu) { } -static void activate_page(struct page *page) +void activate_page(struct page *page) { struct lruvec *lruvec; diff --git a/mm/vmscan.c b/mm/vmscan.c index f3a00fdb1837..8abf8845dc54 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1420,6 +1420,11 @@ retry: if (!sc->may_unmap && page_mapped(page)) goto keep_locked; + /* page_update_gen() tried to promote this page? */ + if (lru_gen_enabled() && !ignore_references && + page_mapped(page) && PageReferenced(page)) + goto keep_locked; + may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); @@ -3076,6 +3081,29 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) * the aging ******************************************************************************/ +/* promote pages accessed through page tables */ +static int page_update_gen(struct page *page, int gen) +{ + unsigned long new_flags, old_flags = READ_ONCE(page->flags); + + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(!rcu_read_lock_held()); + + do { + /* lru_gen_del_page() has isolated this page? */ + if (!(old_flags & LRU_GEN_MASK)) { + /* for shrink_page_list() */ + new_flags = old_flags | BIT(PG_referenced); + continue; + } + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); + + return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} + /* protect pages accessed multiple times through file descriptors */ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming) { @@ -3087,6 +3115,11 @@ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaimin VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page); do { + new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + /* page_update_gen() has promoted this page? */ + if (new_gen >= 0 && new_gen != old_gen) + return new_gen; + new_gen = (old_gen + 1) % MAX_NR_GENS; new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); @@ -3101,6 +3134,43 @@ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaimin return new_gen; } +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long pfn = pte_pfn(pte); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pte_present(pte) || is_zero_pfn(pfn)) + return -1; + + if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + return pfn; +} + +static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg, + struct pglist_data *pgdat) +{ + struct page *page; + + /* try to avoid unnecessary memory loads */ + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return NULL; + + page = compound_head(pfn_to_page(pfn)); + if (page_to_nid(page) != pgdat->node_id) + return NULL; + + if (page_memcg_rcu(page) != memcg) + return NULL; + + return page; +} + static void inc_min_seq(struct lruvec *lruvec, int type) { struct lru_gen_struct *lrugen = &lruvec->lrugen; @@ -3300,6 +3370,114 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); } +/* + * This function exploits spatial locality when shrink_page_list() walks the + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. + */ +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ + int i; + pte_t *pte; + unsigned long start; + unsigned long end; + unsigned long addr; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + struct page *page = pvmw->page; + struct mem_cgroup *memcg = page_memcg(page); + struct pglist_data *pgdat = page_pgdat(page); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + DEFINE_MAX_SEQ(lruvec); + int old_gen, new_gen = lru_gen_from_seq(max_seq); + + lockdep_assert_held(pvmw->ptl); + VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); + + if (spin_is_contended(pvmw->ptl)) + return; + + start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); + end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { + if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + end = start + MIN_LRU_BATCH * PAGE_SIZE; + else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) + start = end - MIN_LRU_BATCH * PAGE_SIZE; + else { + start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; + end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; + } + } + + pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; + + rcu_read_lock(); + arch_enter_lazy_mmu_mode(); + + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + + pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + if (pfn == -1) + continue; + + if (!pte_young(pte[i])) + continue; + + page = get_pfn_page(pfn, memcg, pgdat); + if (!page) + continue; + + if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) + VM_WARN_ON_ONCE(true); + + if (pte_dirty(pte[i]) && !PageDirty(page) && + !(PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page))) + set_page_dirty(page); + + old_gen = page_lru_gen(page); + if (old_gen < 0) + SetPageReferenced(page); + else if (old_gen != new_gen) + __set_bit(i, bitmap); + } + + arch_leave_lazy_mmu_mode(); + rcu_read_unlock(); + + if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + page = pte_page(pte[i]); + activate_page(page); + } + return; + } + + /* page_update_gen() requires stable page_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + return; + + spin_lock_irq(&lruvec->lru_lock); + new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); + + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + page = compound_head(pte_page(pte[i])); + if (page_memcg_rcu(page) != memcg) + continue; + + old_gen = page_update_gen(page, new_gen); + if (old_gen < 0 || old_gen == new_gen) + continue; + + lru_gen_update_size(lruvec, page, old_gen, new_gen); + } + + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_unlock_pages(); +} + /****************************************************************************** * the eviction ******************************************************************************/ @@ -3336,6 +3514,12 @@ static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx) return true; } + /* promoted */ + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { + list_move(&page->lru, &lrugen->lists[gen][type][zone]); + return true; + } + /* protected */ if (tier > tier_idx) { int hist = lru_hist_from_seq(lrugen->min_seq[type]);