diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 054612a2a7fc..04f9050c482d 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -210,6 +210,20 @@ enum kvm_pgtable_prot {
 #define PKVM_HOST_MEM_PROT	KVM_PGTABLE_PROT_RWX
 #define PKVM_HOST_MMIO_PROT	KVM_PGTABLE_PROT_RW
 
+#define KVM_HOST_S2_DEFAULT_MASK	(KVM_PTE_LEAF_ATTR_HI |	\
+					 KVM_PTE_LEAF_ATTR_LO)
+
+#define KVM_HOST_S2_DEFAULT_MEM_PTE		\
+	(PTE_S2_MEMATTR(MT_S2_NORMAL) |		\
+	 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R |	\
+	 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W |	\
+	 KVM_PTE_LEAF_ATTR_LO_S2_AF |		\
+	 FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, KVM_PTE_LEAF_ATTR_LO_S2_SH_IS))
+
+#define KVM_HOST_S2_DEFAULT_MMIO_PTE		\
+	(KVM_HOST_S2_DEFAULT_MEM_PTE |		\
+	 KVM_PTE_LEAF_ATTR_HI_S2_XN)
+
 #define PAGE_HYP		KVM_PGTABLE_PROT_RW
 #define PAGE_HYP_EXEC		(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
 #define PAGE_HYP_RO		(KVM_PGTABLE_PROT_R)
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 735bd4c0c718..74ef0d062719 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -182,7 +182,12 @@ static bool guest_stage2_force_pte_cb(u64 addr, u64 end,
 
 static bool guest_stage2_pte_is_counted(kvm_pte_t pte, u32 level)
 {
-	return host_stage2_pte_is_counted(pte, level);
+	/*
+	 * The refcount tracks valid entries as well as invalid entries if they
+	 * encode ownership of a page to another entity than the page-table
+	 * owner, whose id is 0.
+	 */
+	return !!pte;
 }
 
 static void *guest_s2_zalloc_pages_exact(size_t size)
@@ -617,12 +622,20 @@ static bool host_stage2_force_pte(u64 addr, u64 end, enum kvm_pgtable_prot prot)
 
 static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level)
 {
-	/*
-	 * The refcount tracks valid entries as well as invalid entries if they
-	 * encode ownership of a page to another entity than the page-table
-	 * owner, whose id is 0.
-	 */
-	return !!pte;
+	u64 phys;
+
+	if (!kvm_pte_valid(pte))
+		return !!pte;
+
+	if (kvm_pte_table(pte, level))
+		return true;
+
+	phys = kvm_pte_to_phys(pte);
+	if (addr_is_memory(phys))
+		return (pte & KVM_HOST_S2_DEFAULT_MASK) !=
+			KVM_HOST_S2_DEFAULT_MEM_PTE;
+
+	return (pte & KVM_HOST_S2_DEFAULT_MASK) != KVM_HOST_S2_DEFAULT_MMIO_PTE;
 }
 
 static int host_stage2_idmap(u64 addr)
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index e48b66b744d5..0add34ea9ff2 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -690,25 +690,26 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
 	else
 		new = data->annotation;
 
-	if (pte_ops->pte_is_counted_cb(old, level)) {
-		/*
-		 * Skip updating the PTE if we are trying to recreate the exact
-		 * same mapping or only change the access permissions. Instead,
-		 * the vCPU will exit one more time from guest if still needed
-		 * and then go through the path of relaxing permissions.
-		 */
-		if (!stage2_pte_needs_update(old, new))
-			return -EAGAIN;
+	/*
+	 * Skip updating the PTE if we are trying to recreate the exact
+	 * same mapping or only change the access permissions. Instead,
+	 * the vCPU will exit one more time from guest if still needed
+	 * and then go through the path of relaxing permissions.
+	 */
+	if (!stage2_pte_needs_update(old, new))
+		return -EAGAIN;
 
-		/*
-		 * If we're only changing software bits, then we don't need to
-		 * do anything else/
-		 */
-		if (!((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
-			goto out_set_pte;
+	if (pte_ops->pte_is_counted_cb(old, level))
+		mm_ops->put_page(ptep);
 
-		stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
-	}
+	/*
+	 * If we're only changing software bits, then we don't need to
+	 * do anything else.
+	 */
+	if (!((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
+		goto out_set_pte;
+
+	stage2_clear_pte(ptep, data->mmu, addr, level);
 
 	/* Perform CMOs before installation of the guest stage-2 PTE */
 	if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
@@ -717,10 +718,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
 	if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
 		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
 
+out_set_pte:
 	if (pte_ops->pte_is_counted_cb(new, level))
 		mm_ops->get_page(ptep);
 
-out_set_pte:
 	smp_store_release(ptep, new);
 	if (kvm_phys_is_valid(phys))
 		data->phys += granule;
@@ -785,8 +786,15 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	 * a table. Accesses beyond 'end' that fall within the new table
 	 * will be mapped lazily.
 	 */
-	if (pte_ops->pte_is_counted_cb(pte, level))
+	if (pte_ops->pte_is_counted_cb(pte, level)) {
 		stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
+	} else {
+		/*
+		 * On non-refcounted PTEs we just clear them out without
+		 * dropping the refcount.
+		 */
+		stage2_clear_pte(ptep, data->mmu, addr, level);
+	}
 
 	kvm_set_table_pte(ptep, childp, mm_ops);
 	mm_ops->get_page(ptep);
@@ -794,6 +802,35 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	return 0;
 }
 
+static void stage2_coalesce_walk_table_post(u64 addr, u64 end, u32 level,
+					    kvm_pte_t *ptep,
+					    struct stage2_map_data *data)
+{
+	struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
+	kvm_pte_t *childp = kvm_pte_follow(*ptep, mm_ops);
+
+	/*
+	 * Decrement the refcount only on the set ownership path to avoid a
+	 * loop situation when the following happens:
+	 * 1. We take a host stage2 fault and we create a small mapping which
+	 * has default attributes (is not refcounted).
+	 * 2. On the way back we execute the post handler and we zap the
+	 * table that holds our mapping.
+	 */
+	if (kvm_phys_is_valid(data->phys) ||
+	    !kvm_level_supports_block_mapping(level))
+		return;
+
+	/*
+	 * Free a page that is not referenced anymore and drop the reference
+	 * of the page table page.
+	 */
+	if (mm_ops->page_count(childp) == 1) {
+		stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
+		mm_ops->put_page(childp);
+	}
+}
+
 static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
 				      kvm_pte_t *ptep,
 				      struct stage2_map_data *data)
@@ -802,8 +839,11 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
 	kvm_pte_t *childp;
 	int ret = 0;
 
-	if (!data->anchor)
+	if (!data->anchor) {
+		stage2_coalesce_walk_table_post(addr, end, level, ptep,
+						data);
 		return 0;
+	}
 
 	if (data->anchor == ptep) {
 		childp = data->childp;
@@ -951,7 +991,8 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	 * block entry and rely on the remaining portions being faulted
 	 * back lazily.
 	 */
-	stage2_put_pte(ptep, mmu, addr, level, mm_ops);
+	if (pte_ops->pte_is_counted_cb(pte, level))
+		stage2_put_pte(ptep, mmu, addr, level, mm_ops);
 
 	if (need_flush && mm_ops->dcache_clean_inval_poc)
 		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
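
A minimal user-space sketch of the classification rule that the new host_stage2_pte_is_counted() encodes may help review. Everything below is an illustrative stand-in, not kernel code: the sketch_* names and bit values are invented, and the physical address is passed in as a parameter rather than decoded from the PTE with kvm_pte_to_phys(). The invariant it demonstrates is the one this patch relies on: a leaf PTE pins its page-table page only when it carries state that cannot be rebuilt from the identity map, i.e. ownership annotations, table links, or non-default attributes.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t kvm_pte_t;

/* Invented stand-ins for the real KVM_PTE_* definitions. */
#define SKETCH_VALID		(1ULL << 0)
#define SKETCH_TABLE		(1ULL << 1)
#define SKETCH_XN		(1ULL << 54)
#define SKETCH_ATTR_MASK	(0x3fcULL | SKETCH_XN)
#define SKETCH_DEF_MEM		0x2f8ULL	/* pretend default RAM attrs */
#define SKETCH_DEF_MMIO		(SKETCH_DEF_MEM | SKETCH_XN)

static bool sketch_pte_is_table(kvm_pte_t pte, uint32_t level)
{
	return level < 3 && (pte & SKETCH_VALID) && (pte & SKETCH_TABLE);
}

static bool sketch_addr_is_memory(uint64_t phys)
{
	return phys < (1ULL << 32);	/* pretend: low PA range is RAM */
}

static bool sketch_pte_is_counted(kvm_pte_t pte, uint32_t level, uint64_t phys)
{
	if (!(pte & SKETCH_VALID))
		return pte != 0;	/* invalid but annotated => counted */

	if (sketch_pte_is_table(pte, level))
		return true;		/* table links are always counted */

	/* Valid leaves: counted only if their attributes are non-default. */
	if (sketch_addr_is_memory(phys))
		return (pte & SKETCH_ATTR_MASK) != SKETCH_DEF_MEM;

	return (pte & SKETCH_ATTR_MASK) != SKETCH_DEF_MMIO;
}

int main(void)
{
	/* Default RAM mapping: not counted, prints 0. */
	printf("%d\n", sketch_pte_is_counted(SKETCH_VALID | SKETCH_DEF_MEM,
					     3, 0x80000));
	/* Same leaf with XN set (non-default): counted, prints 1. */
	printf("%d\n", sketch_pte_is_counted(SKETCH_VALID | SKETCH_DEF_MEM |
					     SKETCH_XN, 3, 0x80000));
	return 0;
}

This is what makes the new stage2_coalesce_walk_table_post() work: once every leaf in a child table is a non-counted default mapping, no entry holds a reference on the child any more, so when mm_ops->page_count(childp) falls back to 1 on the set-ownership path the walker can zap the table link with stage2_put_pte() and free the child page, letting the range be mapped as a block again on the next fault.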