ANDROID: KVM: arm64: Convert kvm_pinned_pages to an interval-tree

Here we are, the last straw... the maple tree is a terrible fit for
what we need and it is time to get rid of it. With the upcoming set of
fixes for memory relinquishing of huge mappings, we need to be able to
split a pinned_page under the mmu write_lock. That is far too
complicated with the maple tree, whereas an interval tree, whose nodes
are embedded in the kvm_pinned_page itself, makes such a split a simple
remove followed by a handful of inserts.

Bug: 419548963
Change-Id: I981b5d875085e1d2f7b4ebe2560c8b6ea3cbae88
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Author: Vincent Donnefort
Date:   2025-05-23 15:48:29 +01:00
Parent: 390699f93d
Commit: e56d181356
5 changed files with 111 additions and 105 deletions
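
For orientation before the hunks below: condensed from this commit's kvm_host.h and mmu.c changes, the conversion boils down to the following. INTERVAL_TREE_DEFINE() from <linux/interval_tree_generic.h> generates kvm_pinned_pages_insert(), kvm_pinned_pages_remove(), kvm_pinned_pages_iter_first() and kvm_pinned_pages_iter_next() from the two range callbacks, all operating on a plain struct rb_root_cached embedded in struct kvm_protected_vm.

#include <linux/interval_tree_generic.h>

/* Each entry covers the inclusive IPA range [ipa, ipa + (PAGE_SIZE << order) - 1]. */
struct kvm_pinned_page {
	struct rb_node node;		/* interval-tree linkage */
	struct page *page;
	u64 ipa;
	u64 __subtree_last;		/* maintained by the interval-tree code */
	u8 order;
	u16 pins;
};

static u64 __pinned_page_start(struct kvm_pinned_page *ppage)
{
	return ppage->ipa;
}

static u64 __pinned_page_end(struct kvm_pinned_page *ppage)
{
	return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1;
}

INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
		     __pinned_page_start, __pinned_page_end, /* empty */,
		     kvm_pinned_pages);

Callers walk overlapping entries with the for_ppage_node_in_range() macro added in kvm_host.h, which fetches the next node before running the loop body and therefore tolerates the current entry being removed.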

@@ -42469,6 +42469,10 @@ member {
id: 0x2c96db22
type_id: 0x3d751c99
}
member {
id: 0x2cd6acbf
type_id: 0x3c74c2ee
}
member {
id: 0x2d16b3a0
type_id: 0x3b74be91
@@ -43414,6 +43418,11 @@ member {
type_id: 0x57bf00b8
offset: 576
}
member {
id: 0x3643ba2c
type_id: 0x56209a0e
offset: 256
}
member {
id: 0x36477112
type_id: 0x5633b45b
@@ -45547,6 +45556,11 @@ member {
offset: 242
bitsize: 14
}
member {
id: 0x906218d7
name: "__unused"
type_id: 0xa179a8c5
}
member {
id: 0x9086e58b
name: "__unused"
@@ -156569,10 +156583,9 @@ member {
offset: 576
}
member {
id: 0x03347550
id: 0x0345ffe6
name: "pinned_pages"
type_id: 0xa179a8c5
offset: 256
type_id: 0xd0f3b5bf
}
member {
id: 0x88a7076f
@@ -223874,6 +223887,14 @@ struct_union {
member_id: 0xc101e64f
}
}
struct_union {
id: 0x3c74c2ee
kind: STRUCT
definition {
bytesize: 16
member_id: 0x906218d7
}
}
struct_union {
id: 0x3c9f0fa2
kind: STRUCT
@@ -225487,6 +225508,16 @@ struct_union {
member_id: 0xdf160d99
}
}
struct_union {
id: 0x56209a0e
kind: UNION
definition {
bytesize: 16
member_id: 0x0345ffe6
member_id: 0x2cd6acbf
member_id: 0x36752b74
}
}
struct_union {
id: 0x5633b45b
kind: UNION
@@ -253468,7 +253499,7 @@ struct_union {
bytesize: 64
member_id: 0xb8f5134f
member_id: 0x63c436ff
member_id: 0x03347550
member_id: 0x3643ba2c
member_id: 0x0f7f629e
member_id: 0x3a2d39cb
}

@@ -132,3 +132,7 @@ type 'struct io_ring_ctx' changed
1 variable symbol(s) removed
'struct tracepoint __tracepoint_android_vh_filemap_fault_before_folio_locked'
type 'struct kvm_protected_vm' changed
member 'struct maple_tree pinned_pages' was removed
member 'union { struct rb_root_cached pinned_pages; struct { struct maple_tree __unused; }; union { }; }' was added

@@ -224,20 +224,33 @@ struct kvm_smccc_features {
};
struct kvm_pinned_page {
struct rb_node node;
struct page *page;
u64 ipa;
u64 __subtree_last;
u8 order;
u16 pins;
};
#define KVM_DUMMY_PPAGE ((struct kvm_pinned_page *)-1)
struct kvm_pinned_page
*kvm_pinned_pages_iter_first(struct rb_root_cached *root, u64 start, u64 end);
struct kvm_pinned_page
*kvm_pinned_pages_iter_next(struct kvm_pinned_page *ppage, u64 start, u64 end);
#define for_ppage_node_in_range(kvm, start, end, __ppage, __tmp) \
for (__ppage = kvm_pinned_pages_iter_first(&(kvm)->arch.pkvm.pinned_pages, start, end - 1);\
__ppage && ({ __tmp = kvm_pinned_pages_iter_next(__ppage, start, end - 1); 1; }); \
__ppage = __tmp)
void kvm_pinned_pages_remove(struct kvm_pinned_page *ppage,
struct rb_root_cached *root);
typedef unsigned int pkvm_handle_t;
struct kvm_protected_vm {
pkvm_handle_t handle;
struct kvm_hyp_memcache stage2_teardown_mc;
struct maple_tree pinned_pages;
_ANDROID_KABI_REPLACE(struct maple_tree __unused, struct rb_root_cached pinned_pages);
gpa_t pvmfw_load_addr;
bool enabled;
};
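
A note on the _ANDROID_KABI_REPLACE() line above: struct kvm_protected_vm is reachable from the frozen GKI ABI, so the new root is overlaid on the retired maple_tree member rather than replacing it outright. Roughly what the struct ends up looking like (a sketch based on the union reported in the ABI diff above; the macro's size-check member is elided):

struct kvm_protected_vm {
	pkvm_handle_t handle;
	struct kvm_hyp_memcache stage2_teardown_mc;
	union {
		struct rb_root_cached pinned_pages;	/* new interval-tree root */
		struct {
			struct maple_tree __unused;	/* old member, kept only for layout */
		};
	};
	gpa_t pvmfw_load_addr;
	bool enabled;
};

Per the ABI dump, both the old and the new layout of that member are 16 bytes, so the following members keep their offsets.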

@@ -6,11 +6,11 @@
#include <linux/cma.h>
#include <linux/dma-map-ops.h>
#include <linux/maple_tree.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree_generic.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
@@ -291,6 +291,20 @@ static void invalidate_icache_guest_page(void *va, size_t size)
__invalidate_icache_guest_page(va, size);
}
static u64 __pinned_page_start(struct kvm_pinned_page *ppage)
{
return ppage->ipa;
}
static u64 __pinned_page_end(struct kvm_pinned_page *ppage)
{
return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1;
}
INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
__pinned_page_start, __pinned_page_end, /* empty */,
kvm_pinned_pages);
static int __pkvm_unmap_guest_call(u64 pfn, u64 gfn, u8 order, void *args)
{
struct kvm *kvm = args;
@@ -312,7 +326,7 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
* no update needed from here.
*/
unpin_user_pages(&ppage->page, 1);
mtree_erase(&kvm->arch.pkvm.pinned_pages, ppage->ipa);
kvm_pinned_pages_remove(ppage, &kvm->arch.pkvm.pinned_pages);
kfree(ppage);
return 0;
@@ -320,17 +334,12 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
static int pkvm_unmap_range(struct kvm *kvm, u64 start, u64 end)
{
struct kvm_pinned_page *ppage, *tmp;
struct mm_struct *mm = kvm->mm;
unsigned long index = start;
unsigned long cnt = 0;
void *entry;
int ret = 0;
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
struct kvm_pinned_page *ppage = entry;
if (ppage == KVM_DUMMY_PPAGE)
continue;
for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
ret = pkvm_unmap_guest(kvm, ppage);
if (ret)
break;
@@ -418,8 +427,7 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
static void pkvm_stage2_flush(struct kvm *kvm)
{
unsigned long index = 0;
void *entry;
struct kvm_pinned_page *ppage, *tmp;
/*
* Contrary to stage2_apply_range(), we don't need to check
@@ -427,11 +435,7 @@ static void pkvm_stage2_flush(struct kvm *kvm)
* from a vcpu thread, and the list is only ever freed on VM
* destroy (which only occurs when all vcpu are gone).
*/
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
struct kvm_pinned_page *ppage = entry;
if (ppage == KVM_DUMMY_PPAGE)
continue;
for_ppage_node_in_range(kvm, 0, ULONG_MAX, ppage, tmp) {
__clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE);
cond_resched_rwlock_write(&kvm->mmu_lock);
}
@@ -1014,7 +1018,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
mt_init_flags(&kvm->arch.pkvm.pinned_pages, MT_FLAGS_USE_RCU);
mmu->arch = &kvm->arch;
if (is_protected_kvm_enabled())
@@ -1293,18 +1296,13 @@ static int __pkvm_wrprotect_call(u64 pfn, u64 gfn, u8 order, void *args)
static int pkvm_wp_range(struct kvm *kvm, u64 start, u64 end)
{
unsigned long index = start;
void *entry;
struct kvm_pinned_page *ppage, *tmp;
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
struct kvm_pinned_page *ppage = entry;
for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
int ret;
if (ppage == KVM_DUMMY_PPAGE)
continue;
ret = pkvm_call_hyp_nvhe_ppage(ppage, __pkvm_wrprotect_call,
kvm, false);
if (ret)
return ret;
}
@@ -1630,28 +1628,9 @@ static int pkvm_host_map_guest(u64 pfn, u64 gfn, u64 nr_pages,
return (ret == -EPERM) ? -EAGAIN : ret;
}
static struct kvm_pinned_page *
find_ppage_or_above(struct kvm *kvm, phys_addr_t ipa)
{
unsigned long index = ipa;
void *entry;
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
if (entry == KVM_DUMMY_PPAGE)
continue;
return entry;
}
return NULL;
}
static struct kvm_pinned_page *find_ppage(struct kvm *kvm, u64 ipa)
{
struct kvm_pinned_page *ppage;
unsigned long index = ipa;
ppage = mt_find(&kvm->arch.pkvm.pinned_pages, &index, ipa + PAGE_SIZE - 1);
return ppage == KVM_DUMMY_PPAGE ? NULL : ppage;
return kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages, ipa, ipa + PAGE_SIZE - 1);
}
static int __pkvm_relax_perms_call(u64 pfn, u64 gfn, u8 order, void *args)
@@ -1707,11 +1686,10 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
{
unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
unsigned long index, pmd_offset, page_size, end;
unsigned long page_size = PAGE_SIZE;
struct mm_struct *mm = current->mm;
struct kvm_pinned_page *ppage;
struct kvm *kvm = vcpu->kvm;
struct maple_tree *mt = &kvm->arch.pkvm.pinned_pages;
int ret, nr_pages;
struct page *page;
u64 pfn;
@@ -1760,66 +1738,49 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
}
pfn = page_to_pfn(page);
pmd_offset = *fault_ipa & (PMD_SIZE - 1);
page_size = transparent_hugepage_adjust(kvm, memslot,
hva, &pfn,
fault_ipa);
page = pfn_to_page(pfn);
retry:
if (size)
*size = page_size;
read_lock(&kvm->mmu_lock);
if (!kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
ALIGN_DOWN(*fault_ipa, PMD_SIZE),
ALIGN(*fault_ipa + 1, PMD_SIZE) - 1))
page_size = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, fault_ipa);
/*
* We take the risk of racing with another vCPU, but sync will be restored by the
* host_map_guest HVC
*/
read_unlock(&kvm->mmu_lock);
page = pfn_to_page(pfn);
ret = account_locked_vm(mm, page_size >> PAGE_SHIFT, true);
if (ret)
goto unpin;
index = *fault_ipa;
end = index + page_size - 1;
ppage->page = page;
ppage->ipa = *fault_ipa;
ppage->order = get_order(page_size);
ppage->pins = 1 << ppage->order;
/*
* If we already have a mapping in the middle of the THP, we have no
* other choice than enforcing PAGE_SIZE for pkvm_host_map_guest() to
* succeed.
*/
if (page_size > PAGE_SIZE && mt_find(mt, &index, end)) {
*fault_ipa += pmd_offset;
pfn += pmd_offset >> PAGE_SHIFT;
page = pfn_to_page(pfn);
account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
page_size = PAGE_SIZE;
goto retry;
}
/* Reserve space in the mtree */
ret = mtree_insert_range(mt, index, end, KVM_DUMMY_PPAGE, GFP_KERNEL);
if (ret) {
if (ret == -EEXIST)
ret = 0;
goto dec_account;
}
write_lock(&kvm->mmu_lock);
ret = pkvm_host_map_guest(pfn, *fault_ipa >> PAGE_SHIFT,
page_size >> PAGE_SHIFT, KVM_PGTABLE_PROT_R);
if (ret) {
if (WARN_ON(ret == -EAGAIN))
if (ret == -EAGAIN)
ret = 0;
goto err_unlock;
}
WARN_ON(mtree_store_range(mt, index, end, ppage, GFP_ATOMIC));
kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
write_unlock(&kvm->mmu_lock);
if (size)
*size = page_size;
return 0;
err_unlock:
write_unlock(&kvm->mmu_lock);
dec_account:
account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
unpin:
unpin_user_pages(&page, 1);
@@ -1847,13 +1808,13 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
idx = srcu_read_lock(&vcpu->kvm->srcu);
read_lock(&vcpu->kvm->mmu_lock);
ppage = find_ppage_or_above(vcpu->kvm, fault_ipa);
ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
fault_ipa, ipa_end);
while (fault_ipa < ipa_end) {
if (ppage && ppage != KVM_DUMMY_PPAGE && ppage->ipa == fault_ipa) {
if (ppage && ppage->ipa == fault_ipa) {
page_size = PAGE_SIZE << ppage->order;
ppage = mt_next(&vcpu->kvm->arch.pkvm.pinned_pages,
ppage->ipa, ULONG_MAX);
ppage = kvm_pinned_pages_iter_next(ppage, fault_ipa, ipa_end);
} else {
gfn_t gfn = gpa_to_gfn(fault_ipa);
struct kvm_memory_slot *memslot;
@@ -1877,7 +1838,8 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
* We had to release the mmu_lock so let's update the
* reference.
*/
ppage = find_ppage_or_above(vcpu->kvm, fault_ipa + page_size);
ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
fault_ipa + PAGE_SIZE, ipa_end);
}
fault_ipa += page_size;
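
Finally, to illustrate the commit message's point about splitting: below is a purely hypothetical sketch (not part of this patch, and not necessarily how the upcoming relinquish fixes will do it) of splitting a huge pinned page under the mmu write_lock now that the tree nodes are embedded in kvm_pinned_page. The split_pinned_page() helper and its prealloc array are invented for the example, and pin accounting is glossed over; only kvm_pinned_pages_remove()/insert() come from this patch.

/*
 * Hypothetical illustration only: break a huge kvm_pinned_page into
 * page-sized entries while holding the mmu write_lock. @prealloc is
 * assumed to hold 1 << big->order entries allocated beforehand.
 */
static void split_pinned_page(struct kvm *kvm, struct kvm_pinned_page *big,
			      struct kvm_pinned_page **prealloc)
{
	struct rb_root_cached *root = &kvm->arch.pkvm.pinned_pages;
	unsigned long pfn = page_to_pfn(big->page);
	unsigned long i, nr = 1UL << big->order;

	lockdep_assert_held_write(&kvm->mmu_lock);

	kvm_pinned_pages_remove(big, root);

	for (i = 0; i < nr; i++) {
		struct kvm_pinned_page *ppage = prealloc[i];

		ppage->page = pfn_to_page(pfn + i);
		ppage->ipa = big->ipa + i * PAGE_SIZE;
		ppage->order = 0;
		ppage->pins = 1;	/* real pin accounting elided */
		kvm_pinned_pages_insert(ppage, root);
	}

	kfree(big);
}

No tree-internal allocation happens under the lock; with the maple tree, the equivalent rewrite of a range entry would have had to worry about allocations (or preallocation) while holding a non-sleeping lock, which is presumably the complication the commit message alludes to.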

@@ -319,21 +319,17 @@ static int __reclaim_dying_guest_page_call(u64 pfn, u64 gfn, u8 order, void *arg
static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
{
struct kvm_pinned_page *tmp, *ppage;
struct mm_struct *mm = current->mm;
struct kvm_pinned_page *ppage;
struct kvm_vcpu *host_vcpu;
unsigned long idx, ipa = 0;
unsigned long idx;
if (!host_kvm->arch.pkvm.handle)
goto out_free;
WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, host_kvm->arch.pkvm.handle));
mt_clear_in_rcu(&host_kvm->arch.pkvm.pinned_pages);
mt_for_each(&host_kvm->arch.pkvm.pinned_pages, ppage, ipa, ULONG_MAX) {
if (WARN_ON(ppage == KVM_DUMMY_PPAGE))
continue;
for_ppage_node_in_range(host_kvm, 0, ULONG_MAX, ppage, tmp) {
WARN_ON(pkvm_call_hyp_nvhe_ppage(ppage,
__reclaim_dying_guest_page_call,
host_kvm, true));
@@ -341,9 +337,9 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
account_locked_vm(mm, 1, false);
unpin_user_pages_dirty_lock(&ppage->page, 1, host_kvm->arch.pkvm.enabled);
kvm_pinned_pages_remove(ppage, &host_kvm->arch.pkvm.pinned_pages);
kfree(ppage);
}
mtree_destroy(&host_kvm->arch.pkvm.pinned_pages);
WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm, host_kvm->arch.pkvm.handle));
@@ -538,13 +534,12 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
{
struct mm_struct *mm = current->mm;
struct kvm_pinned_page *ppage;
unsigned long index = ipa;
u16 pins;
write_lock(&host_kvm->mmu_lock);
ppage = mt_find(&host_kvm->arch.pkvm.pinned_pages, &index,
index + PAGE_SIZE - 1);
if (ppage && ppage != KVM_DUMMY_PPAGE) {
ppage = kvm_pinned_pages_iter_first(&host_kvm->arch.pkvm.pinned_pages,
ipa, ipa + PAGE_SIZE - 1);
if (ppage) {
if (ppage->pins)
ppage->pins--;
else
@@ -552,7 +547,8 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
pins = ppage->pins;
if (!pins)
mtree_erase(&host_kvm->arch.pkvm.pinned_pages, ipa);
kvm_pinned_pages_remove(ppage,
&host_kvm->arch.pkvm.pinned_pages);
}
write_unlock(&host_kvm->mmu_lock);