diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 0660a03d37d9..9e25283d6fc9 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -8,7 +8,7 @@ static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { - return !!(vma->vm_flags & VM_HUGETLB); + return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB); } #else diff --git a/include/linux/mm.h b/include/linux/mm.h index 1cf6e5d14c63..fe192fbedf3e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -465,6 +465,8 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 #define FAULT_FLAG_INSTRUCTION 0x100 #define FAULT_FLAG_INTERRUPTIBLE 0x200 +/* Speculative fault, not holding mmap_sem */ +#define FAULT_FLAG_SPECULATIVE 0x400 /* * The default fault flags that should be used by most of the @@ -520,6 +522,10 @@ struct vm_fault { gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address */ +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + unsigned int sequence; + pmd_t orig_pmd; /* value of PMD at the time of fault */ +#endif pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching @@ -1760,6 +1766,31 @@ extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); + +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +extern int __handle_speculative_fault(struct mm_struct *mm, + unsigned long address, + unsigned int flags); +static inline int handle_speculative_fault(struct mm_struct *mm, + unsigned long address, + unsigned int flags) +{ + /* + * Try speculative page faults for multithreaded user space tasks only.
+ */ + if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) + return VM_FAULT_RETRY; + return __handle_speculative_fault(mm, address, flags); +} +#else +static inline int handle_speculative_fault(struct mm_struct *mm, + unsigned long address, + unsigned int flags) +{ + return VM_FAULT_RETRY; +} +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ + void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 14da3ab2cfa2..8b921e9cd777 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -552,8 +552,8 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, pgoff_t pgoff; if (unlikely(is_vm_hugetlb_page(vma))) return linear_hugepage_index(vma, address); - pgoff = (address - vma->vm_start) >> PAGE_SHIFT; - pgoff += vma->vm_pgoff; + pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT; + pgoff += READ_ONCE(vma->vm_pgoff); return pgoff; } diff --git a/mm/internal.h b/mm/internal.h index 4b156e129def..ea3e96c7883c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -40,7 +40,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf); extern struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr); extern void put_vma(struct vm_area_struct *vma); -#endif + +static inline bool vma_has_changed(struct vm_fault *vmf) +{ + int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb); + unsigned int seq = READ_ONCE(vmf->vma->vm_sequence.sequence); + + /* + * Matches both the wmb in write_seqlock_{begin,end}() and + * the wmb in vma_rb_erase().
+ */ + smp_rmb(); + + return ret || seq != vmf->sequence; +} +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/memory.c b/mm/memory.c index 019563c5d513..d94177b6b440 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -557,7 +557,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, if (page) dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", - (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, @@ -2572,6 +2572,113 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL_GPL(apply_to_page_range); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +static bool pte_spinlock(struct vm_fault *vmf) +{ + bool ret = false; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pmd_t pmdval; +#endif + + /* Not speculative: take the page table lock unconditionally */ + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) { + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + return true; + } + + local_irq_disable(); + if (vma_has_changed(vmf)) + goto out; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* + * We check if the pmd value is still the same to ensure that there + * is not a huge collapse operation in progress behind our back.
+ */ + pmdval = READ_ONCE(*vmf->pmd); + if (!pmd_same(pmdval, vmf->orig_pmd)) + goto out; +#endif + + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + if (unlikely(!spin_trylock(vmf->ptl))) + goto out; + + if (vma_has_changed(vmf)) { + spin_unlock(vmf->ptl); + goto out; + } + + ret = true; +out: + local_irq_enable(); + return ret; +} + +static bool pte_map_lock(struct vm_fault *vmf) +{ + bool ret = false; + pte_t *pte; + spinlock_t *ptl; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pmd_t pmdval; +#endif + + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) { + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + return true; + } + + /* + * The first vma_has_changed() guarantees the page-tables are still + * valid, having IRQs disabled ensures they stay around, hence the + * second vma_has_changed() to make sure they are still valid once + * we've got the lock. After that a concurrent zap_pte_range() will + * block on the PTL and thus we're safe. + */ + local_irq_disable(); + if (vma_has_changed(vmf)) + goto out; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* + * We check if the pmd value is still the same to ensure that there + * is not a huge collapse operation in progress behind our back. + */ + pmdval = READ_ONCE(*vmf->pmd); + if (!pmd_same(pmdval, vmf->orig_pmd)) + goto out; +#endif + + /* + * Same as pte_offset_map_lock() except that we call + * spin_trylock() in place of spin_lock() to avoid race with + * unmap path which may have the lock and wait for this CPU + * to invalidate TLB but this CPU has irq disabled.
+ * Since we are in a speculative path, accept that it could fail + */ + ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + pte = pte_offset_map(vmf->pmd, vmf->address); + if (unlikely(!spin_trylock(ptl))) { + pte_unmap(pte); + goto out; + } + + if (vma_has_changed(vmf)) { + pte_unmap_unlock(pte, ptl); + goto out; + } + + vmf->pte = pte; + vmf->ptl = ptl; + ret = true; +out: + local_irq_enable(); + return ret; +} +#else static inline bool pte_spinlock(struct vm_fault *vmf) { vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); @@ -2585,6 +2692,7 @@ static inline bool pte_map_lock(struct vm_fault *vmf) vmf->address, &vmf->ptl); return true; } +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ /* * Scan a region of virtual memory, calling a provided function on @@ -3591,6 +3699,14 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock; + /* + * Don't call the userfaultfd during the speculative path. + * We already checked for the VMA to not be managed through + * userfaultfd, but it may be set behind our back once we have + * locked the pte. In such a case we can ignore it this time.
+ */ + if (vmf->flags & FAULT_FLAG_SPECULATIVE) + goto setpte; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -3637,7 +3753,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) goto unlock_and_release; /* Deliver the page fault to userland, check inside PT lock */ - if (userfaultfd_missing(vma)) { + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) && + userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(page); return handle_userfault(vmf, VM_UFFD_MISSING); @@ -4421,6 +4538,15 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) pte_t entry; if (unlikely(pmd_none(*vmf->pmd))) { + /* + * In the case of the speculative page fault handler we abort + * the speculative path immediately as the pmd is probably + * on the way to being converted into a huge one. We will try + * again holding the mmap_sem (which implies that the collapse + * operation is done). + */ + if (vmf->flags & FAULT_FLAG_SPECULATIVE) + return VM_FAULT_RETRY; /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table @@ -4428,7 +4554,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) * concurrent faults and from rmap lookups. */ vmf->pte = NULL; - } else { + } else if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) { /* See comment in pte_alloc_one_map() */ if (pmd_devmap_trans_unstable(vmf->pmd)) return 0; @@ -4437,6 +4563,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) * pmd from under us anymore at this point because we hold the * mmap_lock read mode and khugepaged takes it in write mode. * So now it's safe to run pte_offset_map(). + * This is not applicable to the speculative page fault handler + * but in that case, the pte is fetched earlier in + * handle_speculative_fault().
*/ vmf->pte = pte_offset_map(vmf->pmd, vmf->address); vmf->orig_pte = *vmf->pte; @@ -4459,6 +4588,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) if (!vmf->pte) { if (vma_is_anonymous(vmf->vma)) return do_anonymous_page(vmf); + else if (vmf->flags & FAULT_FLAG_SPECULATIVE) + return VM_FAULT_RETRY; else return do_fault(vmf); } @@ -4567,6 +4698,9 @@ retry_pud: if (pud_trans_unstable(vmf.pud)) goto retry_pud; +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + vmf.sequence = raw_read_seqcount(&vma->vm_sequence); +#endif if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) @@ -4660,6 +4794,206 @@ static inline void mm_account_fault(struct pt_regs *regs, else perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + +#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL +/* This is required by vm_normal_page() */ +#error "Speculative page fault handler requires CONFIG_ARCH_HAS_PTE_SPECIAL" +#endif + +/* + * vm_normal_page() adds some processing which should be done while + * holding the mmap_sem. + */ +int __handle_speculative_fault(struct mm_struct *mm, unsigned long address, + unsigned int flags) +{ + struct vm_fault vmf = { + .address = address, + }; + pgd_t *pgd, pgdval; + p4d_t *p4d, p4dval; + pud_t pudval; + int seq, ret = VM_FAULT_RETRY; + struct vm_area_struct *vma; + + /* Clear flags that may lead to release the mmap_sem to retry */ + flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE); + flags |= FAULT_FLAG_SPECULATIVE; + + vma = get_vma(mm, address); + if (!vma) + return ret; + + /* rmb <-> seqlock,vma_rb_erase() */ + seq = raw_read_seqcount(&vma->vm_sequence); + if (seq & 1) + goto out_put; + + /* + * Can't call vm_ops service as we don't know what they would do + * with the VMA.
+ * This includes huge pages from hugetlbfs. + */ + if (vma->vm_ops) + goto out_put; + + /* + * __anon_vma_prepare() requires the mmap_sem to be held + * because vm_next and vm_prev must be safe. This can't be guaranteed + * in the speculative path. + */ + if (unlikely(!vma->anon_vma)) + goto out_put; + + vmf.vma_flags = READ_ONCE(vma->vm_flags); + vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot); + + /* Can't call userland page fault handler in the speculative path */ + if (unlikely(vmf.vma_flags & VM_UFFD_MISSING)) + goto out_put; + + if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) + /* + * This could be detected by the check address against VMA's + * boundaries but we want to trace it as not supported instead + * of changed. + */ + goto out_put; + + if (address < READ_ONCE(vma->vm_start) + || READ_ONCE(vma->vm_end) <= address) + goto out_put; + + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_INSTRUCTION, + flags & FAULT_FLAG_REMOTE)) { + ret = VM_FAULT_SIGSEGV; + goto out_put; + } + + /* This one is required to check that the VMA has write access set */ + if (flags & FAULT_FLAG_WRITE) { + if (unlikely(!(vmf.vma_flags & VM_WRITE))) { + ret = VM_FAULT_SIGSEGV; + goto out_put; + } + } else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE)))) { + ret = VM_FAULT_SIGSEGV; + goto out_put; + } + +#ifdef CONFIG_NUMA + struct mempolicy *pol; + + /* + * MPOL_INTERLEAVE implies additional checks in + * mpol_misplaced() which are not compatible with the + * speculative page fault processing. + */ + pol = __get_vma_policy(vma, address); + if (!pol) + pol = get_task_policy(current); + if (pol && pol->mode == MPOL_INTERLEAVE) + goto out_put; +#endif + + /* + * Do a speculative lookup of the PTE entry.
+ */ + local_irq_disable(); + pgd = pgd_offset(mm, address); + pgdval = READ_ONCE(*pgd); + if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval))) + goto out_walk; + + p4d = p4d_offset(pgd, address); + p4dval = READ_ONCE(*p4d); + if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval))) + goto out_walk; + + vmf.pud = pud_offset(p4d, address); + pudval = READ_ONCE(*vmf.pud); + if (pud_none(pudval) || unlikely(pud_bad(pudval))) + goto out_walk; + + /* Huge pages at PUD level are not supported. */ + if (unlikely(pud_trans_huge(pudval))) + goto out_walk; + + vmf.pmd = pmd_offset(vmf.pud, address); + vmf.orig_pmd = READ_ONCE(*vmf.pmd); + /* + * pmd_none could mean that a hugepage collapse is in progress + * behind our back as collapse_huge_page() marks it before + * invalidating the pte (which is done once the IPI is caught + * by all CPUs and we have interrupts disabled). + * For this reason we cannot handle THP in a speculative way since we + * can't safely identify an in-progress collapse operation done behind + * our back on that PMD. + * Regarding the order of the following checks, see comment in + * pmd_devmap_trans_unstable() + */ + if (unlikely(pmd_devmap(vmf.orig_pmd) || + pmd_none(vmf.orig_pmd) || pmd_trans_huge(vmf.orig_pmd) || + is_swap_pmd(vmf.orig_pmd))) + goto out_walk; + + /* + * The above does not allocate/instantiate page-tables because doing so + * would lead to the possibility of instantiating page-tables after + * free_pgtables() -- and consequently leaking them. + * + * The result is that we take at least one !speculative fault per PMD + * in order to instantiate it.
+ */ + + vmf.pte = pte_offset_map(vmf.pmd, address); + vmf.orig_pte = READ_ONCE(*vmf.pte); + barrier(); /* See comment in handle_pte_fault() */ + if (pte_none(vmf.orig_pte)) { + pte_unmap(vmf.pte); + vmf.pte = NULL; + } + + vmf.vma = vma; + vmf.pgoff = linear_page_index(vma, address); + vmf.gfp_mask = __get_fault_gfp_mask(vma); + vmf.sequence = seq; + vmf.flags = flags; + + local_irq_enable(); + + /* + * We need to re-validate the VMA after checking the bounds, otherwise + * we might have a false positive on the bounds. + */ + if (read_seqcount_retry(&vma->vm_sequence, seq)) + goto out_put; + + mem_cgroup_enter_user_fault(); + ret = handle_pte_fault(&vmf); + mem_cgroup_exit_user_fault(); + + put_vma(vma); + + /* + * The task may have entered a memcg OOM situation but + * if the allocation error was handled gracefully (no + * VM_FAULT_OOM), there is no need to kill anything. + * Just clean up the OOM state peacefully. + */ + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) + mem_cgroup_oom_synchronize(false); + return ret; + +out_walk: + local_irq_enable(); +out_put: + put_vma(vma); + return ret; +} +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */ /* * By the time we get here, we already hold the mm semaphore