ANDROID: KVM: arm64: Convert kvm_pinned_pages to an interval-tree

Here we are, the last straw... the maple tree is a terrible fit for
what we need and it is time to get rid of it. With the upcoming set of
fixes for memory relinquishing of huge mappings, we need to be able to
split a pinned_page under the mmu write_lock. That is far too
complicated with the maple tree, whereas an interval tree, whose nodes
are embedded in the kvm_pinned_page itself, makes such a split a simple
remove followed by a handful of inserts.

Bug: 419548963
Change-Id: I981b5d875085e1d2f7b4ebe2560c8b6ea3cbae88
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Author: Vincent Donnefort
Date:   2025-05-23 15:48:29 +01:00
Parent: 390699f93d
Commit: e56d181356
5 changed files with 111 additions and 105 deletions
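
For orientation before the hunks below: condensed from this commit's kvm_host.h and mmu.c changes, the conversion boils down to the following. INTERVAL_TREE_DEFINE() from <linux/interval_tree_generic.h> generates kvm_pinned_pages_insert(), kvm_pinned_pages_remove(), kvm_pinned_pages_iter_first() and kvm_pinned_pages_iter_next() from the two range callbacks, all operating on a plain struct rb_root_cached embedded in struct kvm_protected_vm.

#include <linux/interval_tree_generic.h>

/* Each entry covers the inclusive IPA range [ipa, ipa + (PAGE_SIZE << order) - 1]. */
struct kvm_pinned_page {
	struct rb_node node;		/* interval-tree linkage */
	struct page *page;
	u64 ipa;
	u64 __subtree_last;		/* maintained by the interval-tree code */
	u8 order;
	u16 pins;
};

static u64 __pinned_page_start(struct kvm_pinned_page *ppage)
{
	return ppage->ipa;
}

static u64 __pinned_page_end(struct kvm_pinned_page *ppage)
{
	return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1;
}

INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
		     __pinned_page_start, __pinned_page_end, /* empty */,
		     kvm_pinned_pages);

Callers walk overlapping entries with the for_ppage_node_in_range() macro added in kvm_host.h, which fetches the next node before running the loop body and therefore tolerates the current entry being removed.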

@@ -42469,6 +42469,10 @@ member {
id: 0x2c96db22
type_id: 0x3d751c99
}
member {
id: 0x2cd6acbf
type_id: 0x3c74c2ee
}
member {
id: 0x2d16b3a0
type_id: 0x3b74be91
@@ -43414,6 +43418,11 @@ member {
type_id: 0x57bf00b8
offset: 576
}
member {
id: 0x3643ba2c
type_id: 0x56209a0e
offset: 256
}
member {
id: 0x36477112
type_id: 0x5633b45b
@@ -45547,6 +45556,11 @@ member {
offset: 242
bitsize: 14
}
member {
id: 0x906218d7
name: "__unused"
type_id: 0xa179a8c5
}
member {
id: 0x9086e58b
name: "__unused"
@@ -156569,10 +156583,9 @@ member {
offset: 576
}
member {
id: 0x03347550
id: 0x0345ffe6
name: "pinned_pages"
type_id: 0xa179a8c5
offset: 256
type_id: 0xd0f3b5bf
}
member {
id: 0x88a7076f
@@ -223874,6 +223887,14 @@ struct_union {
member_id: 0xc101e64f
}
}
struct_union {
id: 0x3c74c2ee
kind: STRUCT
definition {
bytesize: 16
member_id: 0x906218d7
}
}
struct_union {
id: 0x3c9f0fa2
kind: STRUCT
@@ -225487,6 +225508,16 @@ struct_union {
member_id: 0xdf160d99
}
}
struct_union {
id: 0x56209a0e
kind: UNION
definition {
bytesize: 16
member_id: 0x0345ffe6
member_id: 0x2cd6acbf
member_id: 0x36752b74
}
}
struct_union {
id: 0x5633b45b
kind: UNION
@@ -253468,7 +253499,7 @@ struct_union {
bytesize: 64
member_id: 0xb8f5134f
member_id: 0x63c436ff
member_id: 0x03347550
member_id: 0x3643ba2c
member_id: 0x0f7f629e
member_id: 0x3a2d39cb
}

@@ -132,3 +132,7 @@ type 'struct io_ring_ctx' changed
1 variable symbol(s) removed
'struct tracepoint __tracepoint_android_vh_filemap_fault_before_folio_locked'
type 'struct kvm_protected_vm' changed
member 'struct maple_tree pinned_pages' was removed
member 'union { struct rb_root_cached pinned_pages; struct { struct maple_tree __unused; }; union { }; }' was added

@@ -224,20 +224,33 @@ struct kvm_smccc_features {
};
struct kvm_pinned_page {
struct rb_node node;
struct page *page;
u64 ipa;
u64 __subtree_last;
u8 order;
u16 pins;
};
#define KVM_DUMMY_PPAGE ((struct kvm_pinned_page *)-1)
struct kvm_pinned_page
*kvm_pinned_pages_iter_first(struct rb_root_cached *root, u64 start, u64 end);
struct kvm_pinned_page
*kvm_pinned_pages_iter_next(struct kvm_pinned_page *ppage, u64 start, u64 end);
#define for_ppage_node_in_range(kvm, start, end, __ppage, __tmp) \
for (__ppage = kvm_pinned_pages_iter_first(&(kvm)->arch.pkvm.pinned_pages, start, end - 1);\
__ppage && ({ __tmp = kvm_pinned_pages_iter_next(__ppage, start, end - 1); 1; }); \
__ppage = __tmp)
void kvm_pinned_pages_remove(struct kvm_pinned_page *ppage,
struct rb_root_cached *root);
typedef unsigned int pkvm_handle_t;
struct kvm_protected_vm {
pkvm_handle_t handle;
struct kvm_hyp_memcache stage2_teardown_mc;
struct maple_tree pinned_pages;
_ANDROID_KABI_REPLACE(struct maple_tree __unused, struct rb_root_cached pinned_pages);
gpa_t pvmfw_load_addr;
bool enabled;
};
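
A note on the _ANDROID_KABI_REPLACE() line above: struct kvm_protected_vm is reachable from the frozen GKI ABI, so the new root is overlaid on the retired maple_tree member rather than replacing it outright. Roughly what the struct ends up looking like (a sketch based on the union reported in the ABI diff above; the macro's size-check member is elided):

struct kvm_protected_vm {
	pkvm_handle_t handle;
	struct kvm_hyp_memcache stage2_teardown_mc;
	union {
		struct rb_root_cached pinned_pages;	/* new interval-tree root */
		struct {
			struct maple_tree __unused;	/* old member, kept only for layout */
		};
	};
	gpa_t pvmfw_load_addr;
	bool enabled;
};

Per the ABI dump, both the old and the new layout of that member are 16 bytes, so the following members keep their offsets.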

@@ -6,11 +6,11 @@
#include <linux/cma.h>
#include <linux/dma-map-ops.h>
#include <linux/maple_tree.h>
#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree_generic.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
@@ -291,6 +291,20 @@ static void invalidate_icache_guest_page(void *va, size_t size)
__invalidate_icache_guest_page(va, size);
}
static u64 __pinned_page_start(struct kvm_pinned_page *ppage)
{
return ppage->ipa;
}
static u64 __pinned_page_end(struct kvm_pinned_page *ppage)
{
return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1;
}
INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
__pinned_page_start, __pinned_page_end, /* empty */,
kvm_pinned_pages);
static int __pkvm_unmap_guest_call(u64 pfn, u64 gfn, u8 order, void *args)
{
struct kvm *kvm = args;
@@ -312,7 +326,7 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
* no update needed from here.
*/
unpin_user_pages(&ppage->page, 1);
mtree_erase(&kvm->arch.pkvm.pinned_pages, ppage->ipa);
kvm_pinned_pages_remove(ppage, &kvm->arch.pkvm.pinned_pages);
kfree(ppage);
return 0;
@@ -320,17 +334,12 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
static int pkvm_unmap_range(struct kvm *kvm, u64 start, u64 end)
{
struct kvm_pinned_page *ppage, *tmp;
struct mm_struct *mm = kvm->mm;
unsigned long index = start;
unsigned long cnt = 0;
void *entry;
int ret = 0;
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
struct kvm_pinned_page *ppage = entry;
if (ppage == KVM_DUMMY_PPAGE)
continue;
for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
ret = pkvm_unmap_guest(kvm, ppage);
if (ret)
break;
@@ -418,8 +427,7 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
static void pkvm_stage2_flush(struct kvm *kvm)
{
unsigned long index = 0;
void *entry;
struct kvm_pinned_page *ppage, *tmp;
/*
* Contrary to stage2_apply_range(), we don't need to check
@@ -427,11 +435,7 @@ static void pkvm_stage2_flush(struct kvm *kvm)
* from a vcpu thread, and the list is only ever freed on VM
* destroy (which only occurs when all vcpu are gone).
*/
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
struct kvm_pinned_page *ppage = entry;
if (ppage == KVM_DUMMY_PPAGE)
continue;
for_ppage_node_in_range(kvm, 0, ULONG_MAX, ppage, tmp) {
__clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE);
cond_resched_rwlock_write(&kvm->mmu_lock);
}
@@ -1014,7 +1018,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
mt_init_flags(&kvm->arch.pkvm.pinned_pages, MT_FLAGS_USE_RCU);
mmu->arch = &kvm->arch;
if (is_protected_kvm_enabled())
@@ -1293,18 +1296,13 @@ static int __pkvm_wrprotect_call(u64 pfn, u64 gfn, u8 order, void *args)
static int pkvm_wp_range(struct kvm *kvm, u64 start, u64 end)
{
unsigned long index = start;
void *entry;
struct kvm_pinned_page *ppage, *tmp;
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
struct kvm_pinned_page *ppage = entry;
for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
int ret;
if (ppage == KVM_DUMMY_PPAGE)
continue;
ret = pkvm_call_hyp_nvhe_ppage(ppage, __pkvm_wrprotect_call,
kvm, false);
if (ret)
return ret;
}
@@ -1630,28 +1628,9 @@ static int pkvm_host_map_guest(u64 pfn, u64 gfn, u64 nr_pages,
return (ret == -EPERM) ? -EAGAIN : ret;
}
static struct kvm_pinned_page *
find_ppage_or_above(struct kvm *kvm, phys_addr_t ipa)
{
unsigned long index = ipa;
void *entry;
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
if (entry == KVM_DUMMY_PPAGE)
continue;
return entry;
}
return NULL;
}
static struct kvm_pinned_page *find_ppage(struct kvm *kvm, u64 ipa)
{
struct kvm_pinned_page *ppage;
unsigned long index = ipa;
ppage = mt_find(&kvm->arch.pkvm.pinned_pages, &index, ipa + PAGE_SIZE - 1);
return ppage == KVM_DUMMY_PPAGE ? NULL : ppage;
return kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages, ipa, ipa + PAGE_SIZE - 1);
}
static int __pkvm_relax_perms_call(u64 pfn, u64 gfn, u8 order, void *args)
@@ -1707,11 +1686,10 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
{
unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
unsigned long index, pmd_offset, page_size, end;
unsigned long page_size = PAGE_SIZE;
struct mm_struct *mm = current->mm;
struct kvm_pinned_page *ppage;
struct kvm *kvm = vcpu->kvm;
struct maple_tree *mt = &kvm->arch.pkvm.pinned_pages;
int ret, nr_pages;
struct page *page;
u64 pfn;
@@ -1760,66 +1738,49 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
}
pfn = page_to_pfn(page);
pmd_offset = *fault_ipa & (PMD_SIZE - 1);
page_size = transparent_hugepage_adjust(kvm, memslot,
hva, &pfn,
fault_ipa);
page = pfn_to_page(pfn);
retry:
if (size)
*size = page_size;
read_lock(&kvm->mmu_lock);
if (!kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
ALIGN_DOWN(*fault_ipa, PMD_SIZE),
ALIGN(*fault_ipa + 1, PMD_SIZE) - 1))
page_size = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, fault_ipa);
/*
* We take the risk of racing with another vCPU, but sync will be restored by the
* host_map_guest HVC
*/
read_unlock(&kvm->mmu_lock);
page = pfn_to_page(pfn);
ret = account_locked_vm(mm, page_size >> PAGE_SHIFT, true);
if (ret)
goto unpin;
index = *fault_ipa;
end = index + page_size - 1;
ppage->page = page;
ppage->ipa = *fault_ipa;
ppage->order = get_order(page_size);
ppage->pins = 1 << ppage->order;
/*
* If we already have a mapping in the middle of the THP, we have no
* other choice than enforcing PAGE_SIZE for pkvm_host_map_guest() to
* succeed.
*/
if (page_size > PAGE_SIZE && mt_find(mt, &index, end)) {
*fault_ipa += pmd_offset;
pfn += pmd_offset >> PAGE_SHIFT;
page = pfn_to_page(pfn);
account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
page_size = PAGE_SIZE;
goto retry;
}
/* Reserve space in the mtree */
ret = mtree_insert_range(mt, index, end, KVM_DUMMY_PPAGE, GFP_KERNEL);
if (ret) {
if (ret == -EEXIST)
ret = 0;
goto dec_account;
}
write_lock(&kvm->mmu_lock);
ret = pkvm_host_map_guest(pfn, *fault_ipa >> PAGE_SHIFT,
page_size >> PAGE_SHIFT, KVM_PGTABLE_PROT_R);
if (ret) {
if (WARN_ON(ret == -EAGAIN))
if (ret == -EAGAIN)
ret = 0;
goto err_unlock;
}
WARN_ON(mtree_store_range(mt, index, end, ppage, GFP_ATOMIC));
kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
write_unlock(&kvm->mmu_lock);
if (size)
*size = page_size;
return 0;
err_unlock:
write_unlock(&kvm->mmu_lock);
dec_account:
account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
unpin:
unpin_user_pages(&page, 1);
@@ -1847,13 +1808,13 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
idx = srcu_read_lock(&vcpu->kvm->srcu);
read_lock(&vcpu->kvm->mmu_lock);
ppage = find_ppage_or_above(vcpu->kvm, fault_ipa);
ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
fault_ipa, ipa_end);
while (fault_ipa < ipa_end) {
if (ppage && ppage != KVM_DUMMY_PPAGE && ppage->ipa == fault_ipa) {
if (ppage && ppage->ipa == fault_ipa) {
page_size = PAGE_SIZE << ppage->order;
ppage = mt_next(&vcpu->kvm->arch.pkvm.pinned_pages,
ppage->ipa, ULONG_MAX);
ppage = kvm_pinned_pages_iter_next(ppage, fault_ipa, ipa_end);
} else {
gfn_t gfn = gpa_to_gfn(fault_ipa);
struct kvm_memory_slot *memslot;
@@ -1877,7 +1838,8 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
* We had to release the mmu_lock so let's update the
* reference.
*/
ppage = find_ppage_or_above(vcpu->kvm, fault_ipa + page_size);
ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
fault_ipa + PAGE_SIZE, ipa_end);
}
fault_ipa += page_size;
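
Finally, to illustrate the commit message's point about splitting: below is a purely hypothetical sketch (not part of this patch, and not necessarily how the upcoming relinquish fixes will do it) of splitting a huge pinned page under the mmu write_lock now that the tree nodes are embedded in kvm_pinned_page. The split_pinned_page() helper and its prealloc array are invented for the example, and pin accounting is glossed over; only kvm_pinned_pages_remove()/insert() come from this patch.

/*
 * Hypothetical illustration only: break a huge kvm_pinned_page into
 * page-sized entries while holding the mmu write_lock. @prealloc is
 * assumed to hold 1 << big->order entries allocated beforehand.
 */
static void split_pinned_page(struct kvm *kvm, struct kvm_pinned_page *big,
			      struct kvm_pinned_page **prealloc)
{
	struct rb_root_cached *root = &kvm->arch.pkvm.pinned_pages;
	unsigned long pfn = page_to_pfn(big->page);
	unsigned long i, nr = 1UL << big->order;

	lockdep_assert_held_write(&kvm->mmu_lock);

	kvm_pinned_pages_remove(big, root);

	for (i = 0; i < nr; i++) {
		struct kvm_pinned_page *ppage = prealloc[i];

		ppage->page = pfn_to_page(pfn + i);
		ppage->ipa = big->ipa + i * PAGE_SIZE;
		ppage->order = 0;
		ppage->pins = 1;	/* real pin accounting elided */
		kvm_pinned_pages_insert(ppage, root);
	}

	kfree(big);
}

No tree-internal allocation happens under the lock; with the maple tree, the equivalent rewrite of a range entry would have had to worry about allocations (or preallocation) while holding a non-sleeping lock, which is presumably the complication the commit message alludes to.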

@@ -319,21 +319,17 @@ static int __reclaim_dying_guest_page_call(u64 pfn, u64 gfn, u8 order, void *arg
static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
{
struct kvm_pinned_page *tmp, *ppage;
struct mm_struct *mm = current->mm;
struct kvm_pinned_page *ppage;
struct kvm_vcpu *host_vcpu;
unsigned long idx, ipa = 0;
unsigned long idx;
if (!host_kvm->arch.pkvm.handle)
goto out_free;
WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, host_kvm->arch.pkvm.handle));
mt_clear_in_rcu(&host_kvm->arch.pkvm.pinned_pages);
mt_for_each(&host_kvm->arch.pkvm.pinned_pages, ppage, ipa, ULONG_MAX) {
if (WARN_ON(ppage == KVM_DUMMY_PPAGE))
continue;
for_ppage_node_in_range(host_kvm, 0, ULONG_MAX, ppage, tmp) {
WARN_ON(pkvm_call_hyp_nvhe_ppage(ppage,
__reclaim_dying_guest_page_call,
host_kvm, true));
@@ -341,9 +337,9 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
account_locked_vm(mm, 1, false);
unpin_user_pages_dirty_lock(&ppage->page, 1, host_kvm->arch.pkvm.enabled);
kvm_pinned_pages_remove(ppage, &host_kvm->arch.pkvm.pinned_pages);
kfree(ppage);
}
mtree_destroy(&host_kvm->arch.pkvm.pinned_pages);
WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm, host_kvm->arch.pkvm.handle));
@@ -538,13 +534,12 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
{
struct mm_struct *mm = current->mm;
struct kvm_pinned_page *ppage;
unsigned long index = ipa;
u16 pins;
write_lock(&host_kvm->mmu_lock);
ppage = mt_find(&host_kvm->arch.pkvm.pinned_pages, &index,
index + PAGE_SIZE - 1);
if (ppage && ppage != KVM_DUMMY_PPAGE) {
ppage = kvm_pinned_pages_iter_first(&host_kvm->arch.pkvm.pinned_pages,
ipa, ipa + PAGE_SIZE - 1);
if (ppage) {
if (ppage->pins)
ppage->pins--;
else
@@ -552,7 +547,8 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
pins = ppage->pins;
if (!pins)
mtree_erase(&host_kvm->arch.pkvm.pinned_pages, ipa);
kvm_pinned_pages_remove(ppage,
&host_kvm->arch.pkvm.pinned_pages);
}
write_unlock(&host_kvm->mmu_lock);