ANDROID: KVM: arm64: Convert kvm_pinned_pages to an interval-tree

Here we are, the last straw: the maple tree is a poor fit for what we
need, and it is time to get rid of it. With the upcoming set of fixes
for memory relinquish with huge mappings, we need to be able to split a
kvm_pinned_page under the mmu write_lock. That is too complicated with
a maple tree, while it is straightforward with an interval tree.
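
For reference, the replacement boils down to the kernel's generic
interval-tree pattern (INTERVAL_TREE_DEFINE() plus the
kvm_pinned_pages_iter_first/next helpers added below). A minimal sketch
of a range walk with the new for_ppage_node_in_range() iterator follows;
the wrapper function name is illustrative only, not part of this patch:

	/*
	 * Walk every pinned page overlapping [start, end) under the mmu
	 * write_lock. The iterator samples the next node into 'tmp' before
	 * running the loop body, so the current ppage may be removed (e.g.
	 * when splitting a huge mapping) without breaking the walk.
	 */
	static void pinned_range_walk_sketch(struct kvm *kvm, u64 start, u64 end)
	{
		struct kvm_pinned_page *ppage, *tmp;

		write_lock(&kvm->mmu_lock);
		for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
			/* inspect, remove or split ppage here */
		}
		write_unlock(&kvm->mmu_lock);
	}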

Bug: 419548963
Change-Id: I981b5d875085e1d2f7b4ebe2560c8b6ea3cbae88
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Author: Vincent Donnefort <vdonnefort@google.com>
Date: 2025-05-23 15:48:29 +01:00
Parent: 390699f93d
Commit: e56d181356

5 changed files with 111 additions and 105 deletions


@@ -42469,6 +42469,10 @@ member {
   id: 0x2c96db22
   type_id: 0x3d751c99
 }
+member {
+  id: 0x2cd6acbf
+  type_id: 0x3c74c2ee
+}
 member {
   id: 0x2d16b3a0
   type_id: 0x3b74be91
@@ -43414,6 +43418,11 @@ member {
   type_id: 0x57bf00b8
   offset: 576
 }
+member {
+  id: 0x3643ba2c
+  type_id: 0x56209a0e
+  offset: 256
+}
 member {
   id: 0x36477112
   type_id: 0x5633b45b
@@ -45547,6 +45556,11 @@ member {
   offset: 242
   bitsize: 14
 }
+member {
+  id: 0x906218d7
+  name: "__unused"
+  type_id: 0xa179a8c5
+}
 member {
   id: 0x9086e58b
   name: "__unused"
@@ -156569,10 +156583,9 @@ member {
   offset: 576
 }
 member {
-  id: 0x03347550
+  id: 0x0345ffe6
   name: "pinned_pages"
-  type_id: 0xa179a8c5
-  offset: 256
+  type_id: 0xd0f3b5bf
 }
 member {
   id: 0x88a7076f
@@ -223874,6 +223887,14 @@ struct_union {
     member_id: 0xc101e64f
   }
 }
+struct_union {
+  id: 0x3c74c2ee
+  kind: STRUCT
+  definition {
+    bytesize: 16
+    member_id: 0x906218d7
+  }
+}
 struct_union {
   id: 0x3c9f0fa2
   kind: STRUCT
@@ -225487,6 +225508,16 @@ struct_union {
     member_id: 0xdf160d99
   }
 }
+struct_union {
+  id: 0x56209a0e
+  kind: UNION
+  definition {
+    bytesize: 16
+    member_id: 0x0345ffe6
+    member_id: 0x2cd6acbf
+    member_id: 0x36752b74
+  }
+}
 struct_union {
   id: 0x5633b45b
   kind: UNION
@@ -253468,7 +253499,7 @@ struct_union {
     bytesize: 64
     member_id: 0xb8f5134f
     member_id: 0x63c436ff
-    member_id: 0x03347550
+    member_id: 0x3643ba2c
     member_id: 0x0f7f629e
     member_id: 0x3a2d39cb
   }


@@ -132,3 +132,7 @@ type 'struct io_ring_ctx' changed
 
 1 variable symbol(s) removed
   'struct tracepoint __tracepoint_android_vh_filemap_fault_before_folio_locked'
+
+type 'struct kvm_protected_vm' changed
+  member 'struct maple_tree pinned_pages' was removed
+  member 'union { struct rb_root_cached pinned_pages; struct { struct maple_tree __unused; }; union { }; }' was added
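
(Aside: per the report line above, the _ANDROID_KABI_REPLACE() used in
kvm_host.h expands to roughly the anonymous union sketched below, keeping
the 16-byte footprint of the retired maple_tree so the layout of
struct kvm_protected_vm stays KABI-stable. This is a simplified sketch,
not the literal macro expansion.)

	union {
		struct rb_root_cached pinned_pages;	/* new member, 16 bytes */
		struct {
			struct maple_tree __unused;	/* old member, kept for size/offset */
		};
	};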


@@ -224,20 +224,33 @@ struct kvm_smccc_features {
 };
 
 struct kvm_pinned_page {
+	struct rb_node node;
 	struct page *page;
 	u64 ipa;
+	u64 __subtree_last;
 	u8 order;
 	u16 pins;
 };
 
-#define KVM_DUMMY_PPAGE ((struct kvm_pinned_page *)-1)
+struct kvm_pinned_page
+*kvm_pinned_pages_iter_first(struct rb_root_cached *root, u64 start, u64 end);
+struct kvm_pinned_page
+*kvm_pinned_pages_iter_next(struct kvm_pinned_page *ppage, u64 start, u64 end);
+
+#define for_ppage_node_in_range(kvm, start, end, __ppage, __tmp)				\
+	for (__ppage = kvm_pinned_pages_iter_first(&(kvm)->arch.pkvm.pinned_pages, start, end - 1);\
+	     __ppage && ({ __tmp = kvm_pinned_pages_iter_next(__ppage, start, end - 1); 1; });	\
+	     __ppage = __tmp)
+
+void kvm_pinned_pages_remove(struct kvm_pinned_page *ppage,
+			     struct rb_root_cached *root);
 
 typedef unsigned int pkvm_handle_t;
 
 struct kvm_protected_vm {
 	pkvm_handle_t handle;
 	struct kvm_hyp_memcache stage2_teardown_mc;
-	struct maple_tree pinned_pages;
+	_ANDROID_KABI_REPLACE(struct maple_tree __unused, struct rb_root_cached pinned_pages);
 	gpa_t pvmfw_load_addr;
 	bool enabled;
 };


@@ -6,11 +6,11 @@
 #include <linux/cma.h>
 #include <linux/dma-map-ops.h>
-#include <linux/maple_tree.h>
 #include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/io.h>
 #include <linux/hugetlb.h>
+#include <linux/interval_tree_generic.h>
 #include <linux/sched/signal.h>
 #include <trace/events/kvm.h>
 #include <asm/pgalloc.h>
@@ -291,6 +291,20 @@ static void invalidate_icache_guest_page(void *va, size_t size)
 	__invalidate_icache_guest_page(va, size);
 }
 
+static u64 __pinned_page_start(struct kvm_pinned_page *ppage)
+{
+	return ppage->ipa;
+}
+
+static u64 __pinned_page_end(struct kvm_pinned_page *ppage)
+{
+	return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
+		     __pinned_page_start, __pinned_page_end, /* empty */,
+		     kvm_pinned_pages);
+
 static int __pkvm_unmap_guest_call(u64 pfn, u64 gfn, u8 order, void *args)
 {
 	struct kvm *kvm = args;
@@ -312,7 +326,7 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
 	 * no update needed from here.
 	 */
 	unpin_user_pages(&ppage->page, 1);
-	mtree_erase(&kvm->arch.pkvm.pinned_pages, ppage->ipa);
+	kvm_pinned_pages_remove(ppage, &kvm->arch.pkvm.pinned_pages);
 	kfree(ppage);
 
 	return 0;
@@ -320,17 +334,12 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
 static int pkvm_unmap_range(struct kvm *kvm, u64 start, u64 end)
 {
+	struct kvm_pinned_page *ppage, *tmp;
 	struct mm_struct *mm = kvm->mm;
-	unsigned long index = start;
 	unsigned long cnt = 0;
-	void *entry;
 	int ret = 0;
 
-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
-		struct kvm_pinned_page *ppage = entry;
-
-		if (ppage == KVM_DUMMY_PPAGE)
-			continue;
+	for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
 		ret = pkvm_unmap_guest(kvm, ppage);
 		if (ret)
 			break;
@@ -418,8 +427,7 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
 static void pkvm_stage2_flush(struct kvm *kvm)
 {
-	unsigned long index = 0;
-	void *entry;
+	struct kvm_pinned_page *ppage, *tmp;
 
 	/*
 	 * Contrary to stage2_apply_range(), we don't need to check
@@ -427,11 +435,7 @@ static void pkvm_stage2_flush(struct kvm *kvm)
 	 * from a vcpu thread, and the list is only ever freed on VM
 	 * destroy (which only occurs when all vcpu are gone).
 	 */
-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
-		struct kvm_pinned_page *ppage = entry;
-
-		if (ppage == KVM_DUMMY_PPAGE)
-			continue;
+	for_ppage_node_in_range(kvm, 0, ULONG_MAX, ppage, tmp) {
 		__clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE);
 		cond_resched_rwlock_write(&kvm->mmu_lock);
 	}
@@ -1014,7 +1018,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
 	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
 	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
 	kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
-	mt_init_flags(&kvm->arch.pkvm.pinned_pages, MT_FLAGS_USE_RCU);
 
 	mmu->arch = &kvm->arch;
 	if (is_protected_kvm_enabled())
@@ -1293,18 +1296,13 @@ static int __pkvm_wrprotect_call(u64 pfn, u64 gfn, u8 order, void *args)
 static int pkvm_wp_range(struct kvm *kvm, u64 start, u64 end)
 {
-	unsigned long index = start;
-	void *entry;
+	struct kvm_pinned_page *ppage, *tmp;
 
-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
-		struct kvm_pinned_page *ppage = entry;
+	for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
 		int ret;
 
-		if (ppage == KVM_DUMMY_PPAGE)
-			continue;
 		ret = pkvm_call_hyp_nvhe_ppage(ppage, __pkvm_wrprotect_call,
 					       kvm, false);
 		if (ret)
 			return ret;
 	}
@@ -1630,28 +1628,9 @@ static int pkvm_host_map_guest(u64 pfn, u64 gfn, u64 nr_pages,
 	return (ret == -EPERM) ? -EAGAIN : ret;
 }
 
-static struct kvm_pinned_page *
-find_ppage_or_above(struct kvm *kvm, phys_addr_t ipa)
-{
-	unsigned long index = ipa;
-	void *entry;
-
-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
-		if (entry == KVM_DUMMY_PPAGE)
-			continue;
-		return entry;
-	}
-
-	return NULL;
-}
-
 static struct kvm_pinned_page *find_ppage(struct kvm *kvm, u64 ipa)
 {
-	struct kvm_pinned_page *ppage;
-	unsigned long index = ipa;
-
-	ppage = mt_find(&kvm->arch.pkvm.pinned_pages, &index, ipa + PAGE_SIZE - 1);
-
-	return ppage == KVM_DUMMY_PPAGE ? NULL : ppage;
+	return kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages, ipa, ipa + PAGE_SIZE - 1);
 }
 
 static int __pkvm_relax_perms_call(u64 pfn, u64 gfn, u8 order, void *args)
@@ -1707,11 +1686,10 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
 {
 	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
 	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
-	unsigned long index, pmd_offset, page_size, end;
+	unsigned long page_size = PAGE_SIZE;
 	struct mm_struct *mm = current->mm;
 	struct kvm_pinned_page *ppage;
 	struct kvm *kvm = vcpu->kvm;
-	struct maple_tree *mt = &kvm->arch.pkvm.pinned_pages;
 	int ret, nr_pages;
 	struct page *page;
 	u64 pfn;
@@ -1760,66 +1738,49 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
 	}
 
 	pfn = page_to_pfn(page);
-	pmd_offset = *fault_ipa & (PMD_SIZE - 1);
-	page_size = transparent_hugepage_adjust(kvm, memslot,
-						hva, &pfn,
-						fault_ipa);
-	page = pfn_to_page(pfn);
-
-retry:
-	if (size)
-		*size = page_size;
+
+	read_lock(&kvm->mmu_lock);
+	if (!kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
+					 ALIGN_DOWN(*fault_ipa, PMD_SIZE),
+					 ALIGN(*fault_ipa + 1, PMD_SIZE) - 1))
+		page_size = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, fault_ipa);
+	/*
+	 * We take the risk of racing with another vCPU, but sync will be restored by the
+	 * host_map_guest HVC
+	 */
+	read_unlock(&kvm->mmu_lock);
+	page = pfn_to_page(pfn);
 
 	ret = account_locked_vm(mm, page_size >> PAGE_SHIFT, true);
 	if (ret)
 		goto unpin;
 
-	index = *fault_ipa;
-	end = index + page_size - 1;
 	ppage->page = page;
 	ppage->ipa = *fault_ipa;
 	ppage->order = get_order(page_size);
 	ppage->pins = 1 << ppage->order;
 
-	/*
-	 * If we already have a mapping in the middle of the THP, we have no
-	 * other choice than enforcing PAGE_SIZE for pkvm_host_map_guest() to
-	 * succeed.
-	 */
-	if (page_size > PAGE_SIZE && mt_find(mt, &index, end)) {
-		*fault_ipa += pmd_offset;
-		pfn += pmd_offset >> PAGE_SHIFT;
-		page = pfn_to_page(pfn);
-		account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
-		page_size = PAGE_SIZE;
-		goto retry;
-	}
-
-	/* Reserve space in the mtree */
-	ret = mtree_insert_range(mt, index, end, KVM_DUMMY_PPAGE, GFP_KERNEL);
-	if (ret) {
-		if (ret == -EEXIST)
-			ret = 0;
-		goto dec_account;
-	}
-
 	write_lock(&kvm->mmu_lock);
 	ret = pkvm_host_map_guest(pfn, *fault_ipa >> PAGE_SHIFT,
 				  page_size >> PAGE_SHIFT, KVM_PGTABLE_PROT_R);
 	if (ret) {
-		if (WARN_ON(ret == -EAGAIN))
+		if (ret == -EAGAIN)
 			ret = 0;
 		goto err_unlock;
 	}
 
-	WARN_ON(mtree_store_range(mt, index, end, ppage, GFP_ATOMIC));
+	kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
 	write_unlock(&kvm->mmu_lock);
 
+	if (size)
+		*size = page_size;
+
 	return 0;
 
 err_unlock:
 	write_unlock(&kvm->mmu_lock);
-dec_account:
 	account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
 unpin:
 	unpin_user_pages(&page, 1);
@@ -1847,13 +1808,13 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	read_lock(&vcpu->kvm->mmu_lock);
 
-	ppage = find_ppage_or_above(vcpu->kvm, fault_ipa);
+	ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
+					    fault_ipa, ipa_end);
 
 	while (fault_ipa < ipa_end) {
-		if (ppage && ppage != KVM_DUMMY_PPAGE && ppage->ipa == fault_ipa) {
+		if (ppage && ppage->ipa == fault_ipa) {
 			page_size = PAGE_SIZE << ppage->order;
-			ppage = mt_next(&vcpu->kvm->arch.pkvm.pinned_pages,
-					ppage->ipa, ULONG_MAX);
+			ppage = kvm_pinned_pages_iter_next(ppage, fault_ipa, ipa_end);
 		} else {
 			gfn_t gfn = gpa_to_gfn(fault_ipa);
 			struct kvm_memory_slot *memslot;
@@ -1877,7 +1838,8 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
 			 * We had to release the mmu_lock so let's update the
 			 * reference.
 			 */
-			ppage = find_ppage_or_above(vcpu->kvm, fault_ipa + page_size);
+			ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
+							    fault_ipa + PAGE_SIZE, ipa_end);
 		}
 
 		fault_ipa += page_size;


@@ -319,21 +319,17 @@ static int __reclaim_dying_guest_page_call(u64 pfn, u64 gfn, u8 order, void *arg
 static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
 {
+	struct kvm_pinned_page *tmp, *ppage;
 	struct mm_struct *mm = current->mm;
-	struct kvm_pinned_page *ppage;
 	struct kvm_vcpu *host_vcpu;
-	unsigned long idx, ipa = 0;
+	unsigned long idx;
 
 	if (!host_kvm->arch.pkvm.handle)
 		goto out_free;
 
 	WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, host_kvm->arch.pkvm.handle));
 
-	mt_clear_in_rcu(&host_kvm->arch.pkvm.pinned_pages);
-	mt_for_each(&host_kvm->arch.pkvm.pinned_pages, ppage, ipa, ULONG_MAX) {
-		if (WARN_ON(ppage == KVM_DUMMY_PPAGE))
-			continue;
+	for_ppage_node_in_range(host_kvm, 0, ULONG_MAX, ppage, tmp) {
 		WARN_ON(pkvm_call_hyp_nvhe_ppage(ppage,
 						 __reclaim_dying_guest_page_call,
 						 host_kvm, true));
@@ -341,9 +337,9 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
 		account_locked_vm(mm, 1, false);
 		unpin_user_pages_dirty_lock(&ppage->page, 1, host_kvm->arch.pkvm.enabled);
+		kvm_pinned_pages_remove(ppage, &host_kvm->arch.pkvm.pinned_pages);
 		kfree(ppage);
 	}
-	mtree_destroy(&host_kvm->arch.pkvm.pinned_pages);
 
 	WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm, host_kvm->arch.pkvm.handle));
@@ -538,13 +534,12 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
 {
 	struct mm_struct *mm = current->mm;
 	struct kvm_pinned_page *ppage;
-	unsigned long index = ipa;
 	u16 pins;
 
 	write_lock(&host_kvm->mmu_lock);
-	ppage = mt_find(&host_kvm->arch.pkvm.pinned_pages, &index,
-			index + PAGE_SIZE - 1);
-	if (ppage && ppage != KVM_DUMMY_PPAGE) {
+	ppage = kvm_pinned_pages_iter_first(&host_kvm->arch.pkvm.pinned_pages,
+					    ipa, ipa + PAGE_SIZE - 1);
+	if (ppage) {
 		if (ppage->pins)
 			ppage->pins--;
 		else
@@ -552,7 +547,8 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
 		pins = ppage->pins;
 		if (!pins)
-			mtree_erase(&host_kvm->arch.pkvm.pinned_pages, ipa);
+			kvm_pinned_pages_remove(ppage,
+						&host_kvm->arch.pkvm.pinned_pages);
 	}
 
 	write_unlock(&host_kvm->mmu_lock);