userfaultfd: add UFFDIO_CONTINUE ioctl
This ioctl is how userspace ought to resolve "minor" userfaults. The
idea is, userspace is notified that a minor fault has occurred. It
might change the contents of the page using its second non-UFFD mapping,
or not. Then, it calls UFFDIO_CONTINUE to tell the kernel "I have
ensured the page contents are correct, carry on setting up the mapping".
Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for
MINOR registered VMAs. ZEROPAGE maps the VMA to the zero page; but in
the minor fault case, we already have some pre-existing underlying page.
Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping.
We'd just use memcpy() or similar instead.
It turns out hugetlb_mcopy_atomic_pte() already does something very close to
what we want, if an existing page is provided via `struct page **pagep`. We
already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so
just extend that design: add an enum for the three modes of operation,
and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE
case. (Basically, look up the existing page, and avoid adding the
existing page to the page cache or calling set_page_huge_active() on
it.)
Link: https://lkml.kernel.org/r/20210301222728.176417-5-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Cc: Adam Ruprecht <ruprecht@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Cannon Matthews <cannonmatthews@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chinwen Chang <chinwen.chang@mediatek.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oliver Upton <oupton@google.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Shawn Anastasio <shawn@anastas.io>
Cc: Steven Price <steven.price@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
committed by
Linus Torvalds
parent
714c189108
commit
f619147104
@@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
|
||||
unsigned long dst_start,
|
||||
unsigned long src_start,
|
||||
unsigned long len,
|
||||
bool zeropage)
|
||||
enum mcopy_atomic_mode mode)
|
||||
{
|
||||
int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
|
||||
int vm_shared = dst_vma->vm_flags & VM_SHARED;
|
||||
@@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
|
||||
* by THP. Since we can not reliably insert a zero page, this
|
||||
* feature is not supported.
|
||||
*/
|
||||
if (zeropage) {
|
||||
if (mode == MCOPY_ATOMIC_ZEROPAGE) {
|
||||
mmap_read_unlock(dst_mm);
|
||||
return -EINVAL;
|
||||
}
|
||||
@@ -273,8 +273,6 @@ retry:
|
||||
}
|
||||
|
||||
while (src_addr < src_start + len) {
|
||||
pte_t dst_pteval;
|
||||
|
||||
BUG_ON(dst_addr >= dst_start + len);
|
||||
|
||||
/*
|
||||
@@ -297,16 +295,16 @@ retry:
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
err = -EEXIST;
|
||||
dst_pteval = huge_ptep_get(dst_pte);
|
||||
if (!huge_pte_none(dst_pteval)) {
|
||||
if (mode != MCOPY_ATOMIC_CONTINUE &&
|
||||
!huge_pte_none(huge_ptep_get(dst_pte))) {
|
||||
err = -EEXIST;
|
||||
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
||||
i_mmap_unlock_read(mapping);
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
|
||||
dst_addr, src_addr, &page);
|
||||
dst_addr, src_addr, mode, &page);
|
||||
|
||||
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
||||
i_mmap_unlock_read(mapping);
|
||||
@@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
|
||||
unsigned long dst_start,
|
||||
unsigned long src_start,
|
||||
unsigned long len,
|
||||
bool zeropage);
|
||||
enum mcopy_atomic_mode mode);
|
||||
#endif /* CONFIG_HUGETLB_PAGE */
|
||||
|
||||
static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
|
||||
@@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
|
||||
unsigned long dst_start,
|
||||
unsigned long src_start,
|
||||
unsigned long len,
|
||||
bool zeropage,
|
||||
enum mcopy_atomic_mode mcopy_mode,
|
||||
bool *mmap_changing,
|
||||
__u64 mode)
|
||||
{
|
||||
@@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
|
||||
long copied;
|
||||
struct page *page;
|
||||
bool wp_copy;
|
||||
bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
|
||||
|
||||
/*
|
||||
* Sanitize the command parameters:
|
||||
@@ -527,10 +526,12 @@ retry:
|
||||
*/
|
||||
if (is_vm_hugetlb_page(dst_vma))
|
||||
return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
|
||||
src_start, len, zeropage);
|
||||
src_start, len, mcopy_mode);
|
||||
|
||||
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
|
||||
goto out_unlock;
|
||||
if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
|
||||
goto out_unlock;
|
||||
|
||||
/*
|
||||
* Ensure the dst_vma has an anon_vma or this page
|
||||
@@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
|
||||
unsigned long src_start, unsigned long len,
|
||||
bool *mmap_changing, __u64 mode)
|
||||
{
|
||||
return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
|
||||
mmap_changing, mode);
|
||||
return __mcopy_atomic(dst_mm, dst_start, src_start, len,
|
||||
MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
|
||||
}
|
||||
|
||||
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
|
||||
unsigned long len, bool *mmap_changing)
|
||||
{
|
||||
return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
|
||||
return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
|
||||
mmap_changing, 0);
|
||||
}
|
||||
|
||||
ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
|
||||
unsigned long len, bool *mmap_changing)
|
||||
{
|
||||
return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
|
||||
mmap_changing, 0);
|
||||
}
|
||||
|
||||
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
|
||||
|
||||
Reference in New Issue
Block a user