From e56d181356a44521a96fb844cfc057ad8030dc8d Mon Sep 17 00:00:00 2001
From: Vincent Donnefort
Date: Fri, 23 May 2025 15:48:29 +0100
Subject: [PATCH] ANDROID: KVM: arm64: Convert kvm_pinned_pages to an
 interval-tree

Here we are, the last straw... the maple-tree is absolutely terrible
for what we need and it is time to get rid of it.

With the upcoming set of fixes for memory relinquishing with huge
mappings, we need to be able to split a pinned_page under the mmu
write_lock. This is just too complicated with the maple-tree, while it
is a piece of cake with an interval tree.

Bug: 419548963
Change-Id: I981b5d875085e1d2f7b4ebe2560c8b6ea3cbae88
Signed-off-by: Vincent Donnefort
---
 android/abi_gki_aarch64.stg                |  39 +++++-
 android/abi_gki_aarch64.stg.allowed_breaks |   4 +
 arch/arm64/include/asm/kvm_host.h          |  17 ++-
 arch/arm64/kvm/mmu.c                       | 134 ++++++++-------
 arch/arm64/kvm/pkvm.c                      |  22 ++--
 5 files changed, 111 insertions(+), 105 deletions(-)

diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg
index 52b09adda2aa..6b0b35306559 100644
--- a/android/abi_gki_aarch64.stg
+++ b/android/abi_gki_aarch64.stg
@@ -42469,6 +42469,10 @@ member {
   id: 0x2c96db22
   type_id: 0x3d751c99
 }
+member {
+  id: 0x2cd6acbf
+  type_id: 0x3c74c2ee
+}
 member {
   id: 0x2d16b3a0
   type_id: 0x3b74be91
@@ -43414,6 +43418,11 @@ member {
   type_id: 0x57bf00b8
   offset: 576
 }
+member {
+  id: 0x3643ba2c
+  type_id: 0x56209a0e
+  offset: 256
+}
 member {
   id: 0x36477112
   type_id: 0x5633b45b
@@ -45547,6 +45556,11 @@ member {
   offset: 242
   bitsize: 14
 }
+member {
+  id: 0x906218d7
+  name: "__unused"
+  type_id: 0xa179a8c5
+}
 member {
   id: 0x9086e58b
   name: "__unused"
@@ -156569,10 +156583,9 @@ member {
   offset: 576
 }
 member {
-  id: 0x03347550
+  id: 0x0345ffe6
   name: "pinned_pages"
-  type_id: 0xa179a8c5
-  offset: 256
+  type_id: 0xd0f3b5bf
 }
 member {
   id: 0x88a7076f
@@ -223874,6 +223887,14 @@ struct_union {
     member_id: 0xc101e64f
   }
 }
+struct_union {
+  id: 0x3c74c2ee
+  kind: STRUCT
+  definition {
+    bytesize: 16
+    member_id: 0x906218d7
+  }
+}
 struct_union {
   id: 0x3c9f0fa2
   kind: STRUCT
@@ -225487,6 +225508,16 @@ struct_union {
     member_id: 0xdf160d99
   }
 }
+struct_union {
+  id: 0x56209a0e
+  kind: UNION
+  definition {
+    bytesize: 16
+    member_id: 0x0345ffe6
+    member_id: 0x2cd6acbf
+    member_id: 0x36752b74
+  }
+}
 struct_union {
   id: 0x5633b45b
   kind: UNION
@@ -253468,7 +253499,7 @@ struct_union {
     bytesize: 64
     member_id: 0xb8f5134f
     member_id: 0x63c436ff
-    member_id: 0x03347550
+    member_id: 0x3643ba2c
     member_id: 0x0f7f629e
     member_id: 0x3a2d39cb
   }
diff --git a/android/abi_gki_aarch64.stg.allowed_breaks b/android/abi_gki_aarch64.stg.allowed_breaks
index 7fd3832d7e2c..efce896d03cc 100644
--- a/android/abi_gki_aarch64.stg.allowed_breaks
+++ b/android/abi_gki_aarch64.stg.allowed_breaks
@@ -132,3 +132,7 @@
 type 'struct io_ring_ctx' changed
   1 variable symbol(s) removed
     'struct tracepoint __tracepoint_android_vh_filemap_fault_before_folio_locked'
+type 'struct kvm_protected_vm' changed
+  member 'struct maple_tree pinned_pages' was removed
+  member 'union { struct rb_root_cached pinned_pages; struct { struct maple_tree __unused; }; union { }; }' was added
+
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 7336137bf221..68998b23221d 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -224,20 +224,33 @@ struct kvm_smccc_features {
 };
 
 struct kvm_pinned_page {
+	struct rb_node node;
 	struct page *page;
 	u64 ipa;
+	u64 __subtree_last;
 	u8 order;
 	u16 pins;
 };
 
-#define KVM_DUMMY_PPAGE ((struct kvm_pinned_page *)-1)
+struct kvm_pinned_page
+*kvm_pinned_pages_iter_first(struct rb_root_cached *root, u64 start, u64 end);
+struct kvm_pinned_page
+*kvm_pinned_pages_iter_next(struct kvm_pinned_page *ppage, u64 start, u64 end);
+
+#define for_ppage_node_in_range(kvm, start, end, __ppage, __tmp) \
+	for (__ppage = kvm_pinned_pages_iter_first(&(kvm)->arch.pkvm.pinned_pages, start, end - 1);\
+	     __ppage && ({ __tmp = kvm_pinned_pages_iter_next(__ppage, start, end - 1); 1; }); \
+	     __ppage = __tmp)
+
+void kvm_pinned_pages_remove(struct kvm_pinned_page *ppage,
+			     struct rb_root_cached *root);
 
 typedef unsigned int pkvm_handle_t;
 
 struct kvm_protected_vm {
 	pkvm_handle_t handle;
 	struct kvm_hyp_memcache stage2_teardown_mc;
-	struct maple_tree pinned_pages;
+	_ANDROID_KABI_REPLACE(struct maple_tree __unused, struct rb_root_cached pinned_pages);
 	gpa_t pvmfw_load_addr;
 	bool enabled;
 };
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c6ec30a19b3b..213957709010 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -6,11 +6,11 @@
 #include
 #include
-#include <linux/maple_tree.h>
 #include
 #include
 #include
 #include
+#include <linux/interval_tree_generic.h>
 #include
 #include
 #include
@@ -291,6 +291,20 @@ static void invalidate_icache_guest_page(void *va, size_t size)
 	__invalidate_icache_guest_page(va, size);
 }
 
+static u64 __pinned_page_start(struct kvm_pinned_page *ppage)
+{
+	return ppage->ipa;
+}
+
+static u64 __pinned_page_end(struct kvm_pinned_page *ppage)
+{
+	return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
+		     __pinned_page_start, __pinned_page_end, /* empty */,
+		     kvm_pinned_pages);
+
 static int __pkvm_unmap_guest_call(u64 pfn, u64 gfn, u8 order, void *args)
 {
 	struct kvm *kvm = args;
@@ -312,7 +326,7 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
 	 * no update needed from here.
 	 */
 	unpin_user_pages(&ppage->page, 1);
-	mtree_erase(&kvm->arch.pkvm.pinned_pages, ppage->ipa);
+	kvm_pinned_pages_remove(ppage, &kvm->arch.pkvm.pinned_pages);
 	kfree(ppage);
 
 	return 0;
@@ -320,17 +334,12 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
 
 static int pkvm_unmap_range(struct kvm *kvm, u64 start, u64 end)
 {
+	struct kvm_pinned_page *ppage, *tmp;
 	struct mm_struct *mm = kvm->mm;
-	unsigned long index = start;
 	unsigned long cnt = 0;
-	void *entry;
 	int ret = 0;
 
-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
-		struct kvm_pinned_page *ppage = entry;
-
-		if (ppage == KVM_DUMMY_PPAGE)
-			continue;
+	for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
 		ret = pkvm_unmap_guest(kvm, ppage);
 		if (ret)
 			break;
@@ -418,8 +427,7 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
 
 static void pkvm_stage2_flush(struct kvm *kvm)
 {
-	unsigned long index = 0;
-	void *entry;
+	struct kvm_pinned_page *ppage, *tmp;
 
 	/*
	 * Contrary to stage2_apply_range(), we don't need to check
@@ -427,11 +435,7 @@ static void pkvm_stage2_flush(struct kvm *kvm)
	 * from a vcpu thread, and the list is only ever freed on VM
	 * destroy (which only occurs when all vcpu are gone).
	 */
-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
-		struct kvm_pinned_page *ppage = entry;
-
-		if (ppage == KVM_DUMMY_PPAGE)
-			continue;
+	for_ppage_node_in_range(kvm, 0, ULONG_MAX, ppage, tmp) {
 		__clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE);
 		cond_resched_rwlock_write(&kvm->mmu_lock);
 	}
@@ -1014,7 +1018,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
 	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
 	kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
-	mt_init_flags(&kvm->arch.pkvm.pinned_pages, MT_FLAGS_USE_RCU);
 
 	mmu->arch = &kvm->arch;
 	if (is_protected_kvm_enabled())
@@ -1293,18 +1296,13 @@ static int __pkvm_wrprotect_call(u64 pfn, u64 gfn, u8 order, void *args)
 
 static int pkvm_wp_range(struct kvm *kvm, u64 start, u64 end)
 {
-	unsigned long index = start;
-	void *entry;
+	struct kvm_pinned_page *ppage, *tmp;
 
-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
-		struct kvm_pinned_page *ppage = entry;
+	for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
 		int ret;
 
-		if (ppage == KVM_DUMMY_PPAGE)
-			continue;
 		ret = pkvm_call_hyp_nvhe_ppage(ppage, __pkvm_wrprotect_call,
					       kvm, false);
-
 		if (ret)
 			return ret;
 	}
@@ -1630,28 +1628,9 @@ static int pkvm_host_map_guest(u64 pfn, u64 gfn, u64 nr_pages,
 	return (ret == -EPERM) ? -EAGAIN : ret;
 }
 
-static struct kvm_pinned_page *
-find_ppage_or_above(struct kvm *kvm, phys_addr_t ipa)
-{
-	unsigned long index = ipa;
-	void *entry;
-
-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
-		if (entry == KVM_DUMMY_PPAGE)
-			continue;
-		return entry;
-	}
-
-	return NULL;
-}
-
 static struct kvm_pinned_page *find_ppage(struct kvm *kvm, u64 ipa)
 {
-	struct kvm_pinned_page *ppage;
-	unsigned long index = ipa;
-
-	ppage = mt_find(&kvm->arch.pkvm.pinned_pages, &index, ipa + PAGE_SIZE - 1);
-	return ppage == KVM_DUMMY_PPAGE ? NULL : ppage;
+	return kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages, ipa, ipa + PAGE_SIZE - 1);
 }
 
 static int __pkvm_relax_perms_call(u64 pfn, u64 gfn, u8 order, void *args)
@@ -1707,11 +1686,10 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
 {
 	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
 	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
-	unsigned long index, pmd_offset, page_size, end;
+	unsigned long page_size = PAGE_SIZE;
 	struct mm_struct *mm = current->mm;
 	struct kvm_pinned_page *ppage;
 	struct kvm *kvm = vcpu->kvm;
-	struct maple_tree *mt = &kvm->arch.pkvm.pinned_pages;
 	int ret, nr_pages;
 	struct page *page;
 	u64 pfn;
@@ -1760,66 +1738,49 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
 	}
 
 	pfn = page_to_pfn(page);
-	pmd_offset = *fault_ipa & (PMD_SIZE - 1);
-	page_size = transparent_hugepage_adjust(kvm, memslot,
-						hva, &pfn,
-						fault_ipa);
-	page = pfn_to_page(pfn);
-retry:
-	if (size)
-		*size = page_size;
+	read_lock(&kvm->mmu_lock);
+	if (!kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
+					 ALIGN_DOWN(*fault_ipa, PMD_SIZE),
+					 ALIGN(*fault_ipa + 1, PMD_SIZE) - 1))
+		page_size = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, fault_ipa);
+
+	/*
+	 * We take the risk of racing with another vCPU, but sync will be restored by the
+	 * host_map_guest HVC
+	 */
+	read_unlock(&kvm->mmu_lock);
+
+	page = pfn_to_page(pfn);
 
 	ret = account_locked_vm(mm, page_size >> PAGE_SHIFT, true);
 	if (ret)
 		goto unpin;
 
-	index = *fault_ipa;
-	end = index + page_size - 1;
 	ppage->page = page;
 	ppage->ipa = *fault_ipa;
 	ppage->order = get_order(page_size);
 	ppage->pins = 1 << ppage->order;
 
-	/*
-	 * If we already have a mapping in the middle of the THP, we have no
-	 * other choice than enforcing PAGE_SIZE for pkvm_host_map_guest() to
-	 * succeed.
-	 */
-	if (page_size > PAGE_SIZE && mt_find(mt, &index, end)) {
-		*fault_ipa += pmd_offset;
-		pfn += pmd_offset >> PAGE_SHIFT;
-		page = pfn_to_page(pfn);
-		account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
-		page_size = PAGE_SIZE;
-		goto retry;
-	}
-
-	/* Reserve space in the mtree */
-	ret = mtree_insert_range(mt, index, end, KVM_DUMMY_PPAGE, GFP_KERNEL);
-	if (ret) {
-		if (ret == -EEXIST)
-			ret = 0;
-		goto dec_account;
-	}
-
 	write_lock(&kvm->mmu_lock);
 	ret = pkvm_host_map_guest(pfn, *fault_ipa >> PAGE_SHIFT,
				  page_size >> PAGE_SHIFT, KVM_PGTABLE_PROT_R);
 	if (ret) {
-		if (WARN_ON(ret == -EAGAIN))
+		if (ret == -EAGAIN)
 			ret = 0;
 		goto err_unlock;
 	}
 
-	WARN_ON(mtree_store_range(mt, index, end, ppage, GFP_ATOMIC));
+	kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
 	write_unlock(&kvm->mmu_lock);
 
+	if (size)
+		*size = page_size;
+
 	return 0;
 
 err_unlock:
 	write_unlock(&kvm->mmu_lock);
-dec_account:
 	account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
 unpin:
 	unpin_user_pages(&page, 1);
@@ -1847,13 +1808,13 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	read_lock(&vcpu->kvm->mmu_lock);
-	ppage = find_ppage_or_above(vcpu->kvm, fault_ipa);
+	ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
+					    fault_ipa, ipa_end);
 
 	while (fault_ipa < ipa_end) {
-		if (ppage && ppage != KVM_DUMMY_PPAGE && ppage->ipa == fault_ipa) {
+		if (ppage && ppage->ipa == fault_ipa) {
 			page_size = PAGE_SIZE << ppage->order;
-			ppage = mt_next(&vcpu->kvm->arch.pkvm.pinned_pages,
-					ppage->ipa, ULONG_MAX);
+			ppage = kvm_pinned_pages_iter_next(ppage, fault_ipa, ipa_end);
 		} else {
 			gfn_t gfn = gpa_to_gfn(fault_ipa);
 			struct kvm_memory_slot *memslot;
@@ -1877,7 +1838,8 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
			 * We had to release the mmu_lock so let's update the
			 * reference.
			 */
-			ppage = find_ppage_or_above(vcpu->kvm, fault_ipa + page_size);
+			ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
+							    fault_ipa + PAGE_SIZE, ipa_end);
 		}
 
 		fault_ipa += page_size;
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 1b9334136f8e..22f31d53c3e0 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -319,21 +319,17 @@ static int __reclaim_dying_guest_page_call(u64 pfn, u64 gfn, u8 order, void *arg
 
 static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
 {
+	struct kvm_pinned_page *tmp, *ppage;
 	struct mm_struct *mm = current->mm;
-	struct kvm_pinned_page *ppage;
 	struct kvm_vcpu *host_vcpu;
-	unsigned long idx, ipa = 0;
+	unsigned long idx;
 
 	if (!host_kvm->arch.pkvm.handle)
 		goto out_free;
 
 	WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, host_kvm->arch.pkvm.handle));
 
-	mt_clear_in_rcu(&host_kvm->arch.pkvm.pinned_pages);
-
-	mt_for_each(&host_kvm->arch.pkvm.pinned_pages, ppage, ipa, ULONG_MAX) {
-		if (WARN_ON(ppage == KVM_DUMMY_PPAGE))
-			continue;
+	for_ppage_node_in_range(host_kvm, 0, ULONG_MAX, ppage, tmp) {
 		WARN_ON(pkvm_call_hyp_nvhe_ppage(ppage,
						 __reclaim_dying_guest_page_call,
						 host_kvm, true));
@@ -341,9 +337,9 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
 		account_locked_vm(mm, 1, false);
 		unpin_user_pages_dirty_lock(&ppage->page, 1,
					    host_kvm->arch.pkvm.enabled);
+		kvm_pinned_pages_remove(ppage, &host_kvm->arch.pkvm.pinned_pages);
 		kfree(ppage);
 	}
-	mtree_destroy(&host_kvm->arch.pkvm.pinned_pages);
 
 	WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm,
				  host_kvm->arch.pkvm.handle));
@@ -538,13 +534,12 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
 {
 	struct mm_struct *mm = current->mm;
 	struct kvm_pinned_page *ppage;
-	unsigned long index = ipa;
 	u16 pins;
 
 	write_lock(&host_kvm->mmu_lock);
-	ppage = mt_find(&host_kvm->arch.pkvm.pinned_pages, &index,
-			index + PAGE_SIZE - 1);
-	if (ppage && ppage != KVM_DUMMY_PPAGE) {
+	ppage = kvm_pinned_pages_iter_first(&host_kvm->arch.pkvm.pinned_pages,
+					    ipa, ipa + PAGE_SIZE - 1);
+	if (ppage) {
 		if (ppage->pins)
 			ppage->pins--;
 		else
@@ -552,7 +547,8 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
 
 		pins = ppage->pins;
 		if (!pins)
-			mtree_erase(&host_kvm->arch.pkvm.pinned_pages, ipa);
+			kvm_pinned_pages_remove(ppage,
						&host_kvm->arch.pkvm.pinned_pages);
 	}
 
 	write_unlock(&host_kvm->mmu_lock);
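
Editor's note on the interval-tree pattern used above (not part of the patch):
the INTERVAL_TREE_DEFINE() instantiation in mmu.c generates
kvm_pinned_pages_insert()/_remove()/_iter_first()/_iter_next() over an
rb_root_cached, and for_ppage_node_in_range() is a removal-safe wrapper around
the two iterators. The stand-alone sketch below uses the same
<linux/interval_tree_generic.h> pattern with made-up names (pin_range, pin_it_*,
pin_tree, pin_lookup, pin_split) to illustrate why splitting a pinned range, the
motivation given in the commit message, is simple with this structure: remove
the node, then insert smaller nodes over the same IPA span.

/* Illustrative sketch only -- not from the patch. */
#include <linux/interval_tree_generic.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/types.h>

struct pin_range {
	struct rb_node node;
	u64 ipa;		/* start of the pinned IPA range */
	u64 __subtree_last;	/* maintained by the interval tree */
	u8 order;		/* range covers PAGE_SIZE << order bytes */
};

static u64 pin_start(struct pin_range *p)
{
	return p->ipa;
}

static u64 pin_last(struct pin_range *p)
{
	/* Interval trees use an inclusive last address. */
	return p->ipa + (PAGE_SIZE << p->order) - 1;
}

/* Generates pin_it_insert(), pin_it_remove(), pin_it_iter_first/next(). */
INTERVAL_TREE_DEFINE(struct pin_range, node, u64, __subtree_last,
		     pin_start, pin_last, static, pin_it);

static struct rb_root_cached pin_tree = RB_ROOT_CACHED;

/* Stab query: any pinned range overlapping [ipa, ipa + PAGE_SIZE)? */
static struct pin_range *pin_lookup(u64 ipa)
{
	return pin_it_iter_first(&pin_tree, ipa, ipa + PAGE_SIZE - 1);
}

/*
 * Split a block-sized range into page-sized ones: one remove plus N
 * inserts on the same root. (Allocation is inlined for brevity; with
 * the mmu write_lock held the new nodes would be allocated up front.)
 */
static int pin_split(struct pin_range *big)
{
	unsigned int i, nr = 1U << big->order;
	u64 ipa = big->ipa;

	pin_it_remove(big, &pin_tree);
	kfree(big);

	for (i = 0; i < nr; i++) {
		struct pin_range *p = kzalloc(sizeof(*p), GFP_KERNEL);

		if (!p)
			return -ENOMEM;
		p->ipa = ipa + i * PAGE_SIZE;
		p->order = 0;
		pin_it_insert(p, &pin_tree);
	}

	return 0;
}

Compare this with the old pkvm_mem_abort() flow removed above: reserve a
KVM_DUMMY_PPAGE entry in the maple tree, retry with PAGE_SIZE on collision,
then store over the reservation. With the interval tree a single stab query
over the PMD-aligned window decides whether THP adjustment is allowed, and the
node is inserted once under the write_lock.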
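
On the ABI side, the allowed_breaks entry spells out what the
_ANDROID_KABI_REPLACE() line in kvm_host.h resolves to: the new rb_root_cached
root is overlaid on the retired maple_tree member inside an anonymous union, so
the 16-byte replacement union stays at the same offset in struct
kvm_protected_vm and the GKI layout recorded in abi_gki_aarch64.stg is
preserved. A rough hand-expansion, for illustration only (the real macro in
android_kabi.h also emits size and alignment checks), looks like:

	union {
		struct rb_root_cached pinned_pages;	/* new layout */
		struct {
			struct maple_tree __unused;	/* old layout, kept for ABI */
		};
	};

This is why the STG diff adds a new UNION/STRUCT pair and an "__unused" member
rather than simply retyping pinned_pages.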