From 56cc224601caca24f778f1b38a4a1b09a1b31a8f Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 23 Jun 2025 13:39:53 +0000 Subject: [PATCH 1/9] ANDROID: BACKPORT: KVM: arm64: Always unmap the pvmfw region at stage-2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The donation of the pvmfw region to pKVM is currently done transparently as part of fix_host_ownership(). However, this function only runs over PA ranges covered by the memblock list, although there is no guarantee for the pvmfw region to be advertised in a memory node in DT. In this case, the pKVM init will appear to succeed while silently keeping valid host stage-2 mappings to the pvmfw region. Fix this by forcefully registering the pvmfw region in the pKVM memblock list. BACKPORT: Fix usage of pvmfw_size and pvmfw_base which are pointers in 6.6 and earlier. Bug: 278749606 Bug: 424382332 Reported-by: Bartłomiej Grzesik Suggested-by: Will Deacon Change-Id: I8f5498df25debb432b7dffd1e40a8910bcec7b49 Signed-off-by: Quentin Perret --- arch/arm64/kvm/pkvm.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 4523cc6f2725..61b524129366 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -65,6 +65,7 @@ static void __init sort_memblock_regions(void) static int __init register_memblock_regions(void) { struct memblock_region *reg; + bool pvmfw_in_mem = false; for_each_mem_region(reg) { if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) @@ -72,6 +73,27 @@ static int __init register_memblock_regions(void) hyp_memory[*hyp_memblock_nr_ptr] = *reg; (*hyp_memblock_nr_ptr)++; + + if (!*pvmfw_size || pvmfw_in_mem || + !memblock_addrs_overlap(reg->base, reg->size, *pvmfw_base, *pvmfw_size)) + continue; + /* If the pvmfw region overlaps a memblock, it must be a subset */ + if (*pvmfw_base < reg->base || + (*pvmfw_base + *pvmfw_size) > (reg->base + reg->size)) + return -EINVAL; + pvmfw_in_mem = true; + } + + if (*pvmfw_size && !pvmfw_in_mem) { + if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) + return -ENOMEM; + + hyp_memory[*hyp_memblock_nr_ptr] = (struct memblock_region) { + .base = *pvmfw_base, + .size = *pvmfw_size, + .flags = MEMBLOCK_NOMAP, + }; + (*hyp_memblock_nr_ptr)++; } sort_memblock_regions(); From 390d8897c3243bbb3b122a08b4d4b6e4bd5e796d Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Wed, 25 Jun 2025 20:10:29 +0800 Subject: [PATCH 2/9] ANDROID: vendor hooks: Add new android_rvh for adjust water mark The trace_android_vh_alloc_pages_adjust_wmark() and trace_android_vh_alloc_pages_reset_wmark() have been deprecated, because they cannot be used in a CPU offline or non-atomic context, the trace_android_rvh_alloc_pages_adjust_wmark() and trace_android_rvh_alloc_pages_reset_wmark() should be used instead. 
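For reference, a vendor module would attach to the new restricted hooks roughly as below. This is an illustrative sketch only (not part of this patch): the probe name, empty policy body, and module boilerplate are placeholders. Restricted-hook probes receive the registration cookie as their first argument and, unlike regular vendor hooks, cannot be unregistered.

```c
#include <linux/module.h>
#include <trace/hooks/mm.h>

/* Probe signature mirrors the hook's TP_PROTO, preceded by the data cookie. */
static void vendor_adjust_wmark(void *unused, gfp_t gfp_mask, int order,
				int *alloc_flags)
{
	/* Vendor policy would inspect gfp_mask/order and tweak *alloc_flags. */
}

static int __init vendor_wmark_probe_init(void)
{
	/* Restricted hooks can only be registered, never unregistered. */
	return register_trace_android_rvh_alloc_pages_adjust_wmark(
			vendor_adjust_wmark, NULL);
}
module_init(vendor_wmark_probe_init);
MODULE_LICENSE("GPL");
```
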
Bug: 427378244 Change-Id: I641a4bb5548120686a67a56067648b4e23b2f0e1 Signed-off-by: Qianfeng Rong --- drivers/android/vendor_hooks.c | 2 ++ include/trace/hooks/mm.h | 9 +++++++++ mm/page_alloc.c | 16 +++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index eaea41831d1f..d3f7ff4fde56 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -463,6 +463,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reclaim_bypass); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_failure_bypass); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_adjust_wmark); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reset_wmark); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_alloc_pages_adjust_wmark); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_alloc_pages_reset_wmark); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_watermark_fast_ok); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_fiq_dump); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_swapmem_gather_init); diff --git a/include/trace/hooks/mm.h b/include/trace/hooks/mm.h index 00df4c5ea263..65eb40c00944 100644 --- a/include/trace/hooks/mm.h +++ b/include/trace/hooks/mm.h @@ -156,6 +156,15 @@ DECLARE_HOOK(android_vh_alloc_pages_reset_wmark, unsigned long direct_reclaim_retries), TP_ARGS(gfp_mask, order, alloc_flags, did_some_progress, no_progress_loops, direct_reclaim_retries)); +DECLARE_RESTRICTED_HOOK(android_rvh_alloc_pages_adjust_wmark, + TP_PROTO(gfp_t gfp_mask, int order, int *alloc_flags), + TP_ARGS(gfp_mask, order, alloc_flags), 3); +DECLARE_RESTRICTED_HOOK(android_rvh_alloc_pages_reset_wmark, + TP_PROTO(gfp_t gfp_mask, int order, int *alloc_flags, + unsigned long *did_some_progress, int *no_progress_loops, + unsigned long direct_reclaim_retries), + TP_ARGS(gfp_mask, order, alloc_flags, did_some_progress, + no_progress_loops, direct_reclaim_retries), 6); DECLARE_HOOK(android_vh_unreserve_highatomic_bypass, TP_PROTO(bool force, struct zone *zone, bool *skip_unreserve_highatomic), TP_ARGS(force, zone, skip_unreserve_highatomic)); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b42afcd0d3c3..152b0424fcbf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4438,8 +4438,15 @@ restart: if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); - if (can_direct_reclaim && !direct_reclaim_retries && !(current->flags & PF_MEMALLOC)) + if (can_direct_reclaim && !direct_reclaim_retries && !(current->flags & PF_MEMALLOC)) { + /* + * The trace_android_vh_alloc_pages_adjust_wmark() has been deprecated + * because it cannot be used in a CPU offline or non-atomic context, + * please use trace_android_rvh_alloc_pages_adjust_wmark(). + */ trace_android_vh_alloc_pages_adjust_wmark(gfp_mask, order, &alloc_flags); + trace_android_rvh_alloc_pages_adjust_wmark(gfp_mask, order, &alloc_flags); + } /* * The adjusted alloc_flags might result in immediate success, so try @@ -4587,8 +4594,15 @@ retry: !(gfp_mask & __GFP_RETRY_MAYFAIL))) goto nopage; + /* + * The trace_android_vh_alloc_pages_reset_wmark() has been deprecated + * because it cannot be used in a CPU offline or non-atomic context, + * please use trace_android_rvh_alloc_pages_reset_wmark(). 
+ */ trace_android_vh_alloc_pages_reset_wmark(gfp_mask, order, &alloc_flags, &did_some_progress, &no_progress_loops, direct_reclaim_retries); + trace_android_rvh_alloc_pages_reset_wmark(gfp_mask, order, + &alloc_flags, &did_some_progress, &no_progress_loops, direct_reclaim_retries); if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) From 1c1f2b75268a960070edd1c8c43da0214b154b0f Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Wed, 25 Jun 2025 20:22:58 +0800 Subject: [PATCH 3/9] ANDROID: GKI: vivo add symbols to symbol list 2 function symbol(s) added 'int __traceiter_android_rvh_alloc_pages_adjust_wmark(void*, gfp_t, int, int*)' 'int __traceiter_android_rvh_alloc_pages_reset_wmark(void*, gfp_t, int, int*, unsigned long*, int*, unsigned long)' 2 variable symbol(s) added 'struct tracepoint __tracepoint_android_rvh_alloc_pages_adjust_wmark' 'struct tracepoint __tracepoint_android_rvh_alloc_pages_reset_wmark' Bug: 427378244 Change-Id: I2929065b78ae40226c7da679eadd898259e4b9e7 Signed-off-by: Qianfeng Rong --- android/abi_gki_aarch64.stg | 40 ++++++++++++++++++++++++++++++++++++ android/abi_gki_aarch64_vivo | 4 ++++ 2 files changed, 44 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 0de7176f5e86..d8ed1e76d857 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -359430,6 +359430,24 @@ elf_symbol { type_id: 0x9bc8472e full_name: "__traceiter_android_rvh_alloc_and_link_pwqs" } +elf_symbol { + id: 0xc0fd1a1f + name: "__traceiter_android_rvh_alloc_pages_adjust_wmark" + is_defined: true + symbol_type: FUNCTION + crc: 0x6a18478a + type_id: 0x9870a448 + full_name: "__traceiter_android_rvh_alloc_pages_adjust_wmark" +} +elf_symbol { + id: 0x6eed3175 + name: "__traceiter_android_rvh_alloc_pages_reset_wmark" + is_defined: true + symbol_type: FUNCTION + crc: 0x414d4c97 + type_id: 0x9870a59a + full_name: "__traceiter_android_rvh_alloc_pages_reset_wmark" +} elf_symbol { id: 0xef79dd4d name: "__traceiter_android_rvh_alloc_workqueue" @@ -366522,6 +366540,24 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_rvh_alloc_and_link_pwqs" } +elf_symbol { + id: 0x89ff3495 + name: "__tracepoint_android_rvh_alloc_pages_adjust_wmark" + is_defined: true + symbol_type: OBJECT + crc: 0xc63d662f + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_rvh_alloc_pages_adjust_wmark" +} +elf_symbol { + id: 0xab6e1e0f + name: "__tracepoint_android_rvh_alloc_pages_reset_wmark" + is_defined: true + symbol_type: OBJECT + crc: 0xdbce1a35 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_rvh_alloc_pages_reset_wmark" +} elf_symbol { id: 0x0b219d2b name: "__tracepoint_android_rvh_alloc_workqueue" @@ -436771,6 +436807,8 @@ interface { symbol_id: 0xb42422d5 symbol_id: 0xb3d70eab symbol_id: 0x9ca1a40f + symbol_id: 0xc0fd1a1f + symbol_id: 0x6eed3175 symbol_id: 0xef79dd4d symbol_id: 0x0b48afa1 symbol_id: 0xa927338c @@ -437559,6 +437597,8 @@ interface { symbol_id: 0x4b7a8fd7 symbol_id: 0xcd36f539 symbol_id: 0x33f0c37d + symbol_id: 0x89ff3495 + symbol_id: 0xab6e1e0f symbol_id: 0x0b219d2b symbol_id: 0x748c1fd7 symbol_id: 0xcb42202e diff --git a/android/abi_gki_aarch64_vivo b/android/abi_gki_aarch64_vivo index b8f2d60402fd..093e3588d263 100644 --- a/android/abi_gki_aarch64_vivo +++ b/android/abi_gki_aarch64_vivo @@ -108,9 +108,11 @@ __traceiter_android_vh_account_process_tick_gran __traceiter_android_vh_adjust_kvmalloc_flags __traceiter_android_vh_alloc_pages_adjust_wmark + 
__traceiter_android_rvh_alloc_pages_adjust_wmark __traceiter_android_vh_alloc_pages_failure_bypass __traceiter_android_vh_alloc_pages_reclaim_bypass __traceiter_android_vh_alloc_pages_reset_wmark + __traceiter_android_rvh_alloc_pages_reset_wmark __traceiter_android_vh_alter_mutex_list_add __traceiter_android_vh_alter_rwsem_list_add __traceiter_android_vh_bd_link_disk_holder @@ -241,9 +243,11 @@ __tracepoint_android_vh_account_process_tick_gran __tracepoint_android_vh_adjust_kvmalloc_flags __tracepoint_android_vh_alloc_pages_adjust_wmark + __tracepoint_android_rvh_alloc_pages_adjust_wmark __tracepoint_android_vh_alloc_pages_failure_bypass __tracepoint_android_vh_alloc_pages_reclaim_bypass __tracepoint_android_vh_alloc_pages_reset_wmark + __tracepoint_android_rvh_alloc_pages_reset_wmark __tracepoint_android_vh_alter_mutex_list_add __tracepoint_android_vh_alter_rwsem_list_add __tracepoint_android_vh_bd_link_disk_holder From fe3caa5756634542c9d65436ad7c196fac7edcde Mon Sep 17 00:00:00 2001 From: Juan Yescas Date: Wed, 25 Jun 2025 17:53:01 -0700 Subject: [PATCH 4/9] ANDROID: mm: Set __GFP_CMA in do_swap_page() for folio allocations In the do_swap_page() path, the memory allocations were failing even if there were free CMA pages. The allocations were not fallbacking to CMA bucket. This was due the requested folios were not marked as __GFP_CMA and as a consequence, the ALLOC_CMA was not set. ``` static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, unsigned int alloc_flags) { /* * If cma_redirect_restricted is true, set ALLOC_CMA only for * movable allocations that have __GFP_CMA. */ if ((!cma_redirect_restricted() || gfp_mask & __GFP_CMA) && gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; trace_android_vh_alloc_flags_cma_adjust(gfp_mask, &alloc_flags); return alloc_flags; } ``` This was introduced in the change I9d16a9cae1c6c0f6cdb03183038fab095843001e ("BACKPORT: mm: support large folios swap-in for sync io devices") Bug: 427802573 Bug: 425779146 Bug: 422586344 Bug: 313807618 Test: Built and run kernel Change-Id: Ied33777bb04198f1e4a69b91f002ae70d0471bb3 Fixes: 988dc02cddcb ("BACKPORT: mm: support large folios swap-in for sync io devices") Signed-off-by: Juan Yescas --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index dfbd0a2795db..a04841dc9291 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3962,7 +3962,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) struct folio *folio; swp_entry_t entry; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE|__GFP_CMA, 0, vma, vmf->address, false); if (!folio) return NULL; From 87cddfadcdd3614e5d3123bbfa55683d8389a8b8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 5 Feb 2025 22:15:14 -0800 Subject: [PATCH 5/9] BACKPORT: mm/madvise: split out mmap locking operations for madvise() Patch series "mm/madvise: remove redundant mmap_lock operations from process_madvise()". process_madvise() calls do_madvise() for each address range. Then, each do_madvise() invocation holds and releases same mmap_lock. Optimize the redundant lock operations by splitting do_madvise() internal logic including the mmap_lock operations, and calling the small logic directly from process_madvise() in a sequence that removes the redundant locking. As a result of this change, process_madvise() becomes more efficient and less racy in terms of its results and latency. 
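For illustration, the batched caller pattern that benefits here looks roughly like the sketch below (userspace, not part of this patch; the helper name and the assumption that the kernel headers define __NR_process_madvise are mine). A single process_madvise() call covers many ranges, so after this series the mmap_lock is taken once per batch rather than once per range.

```c
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

/* Apply MADV_DONTNEED to 'vlen' address ranges of the target process
 * (identified by pidfd) with one syscall; flags must currently be 0. */
static long dontneed_batch(int pidfd, const struct iovec *iov, size_t vlen)
{
	return syscall(__NR_process_madvise, pidfd, iov, vlen,
		       MADV_DONTNEED, 0);
}
```
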
Note that the potential downside of this series is that other mmap_lock holders may take more time due to the increased length of mmap_lock critical section for process_madvise() calls. But there is maximum limit in the kernel space (IOV_MAX), and userspace can control the critical section length by setting the request size. Hence, the downside would be limited and controllable. Evaluation ========== I measured the time to apply MADV_DONTNEED advice to 256 MiB memory using multiple madvise() calls, 4 KiB per each call. I also do the same with process_madvise(), but with varying batch size (vlen) from 1 to 1024. The source code for the measurement is available at GitHub[1]. Because the microbenchmark result is not that stable, I ran each configuration five times and use the average. The measurement results are as below. 'sz_batches' column shows the batch size of process_madvise() calls. '0' batch size is for madvise() calls case. 'before' and 'after' columns are the measured time to apply MADV_DONTNEED to the 256 MiB memory buffer in nanoseconds, on kernels that built without and with the last patch of this series, respectively. So lower value means better efficiency. 'after/before' column is the ratio of 'after' to 'before'. sz_batches before after after/before 0 146294215.2 121280536.2 0.829017989769427 1 165851018.8 136305598.2 0.821855658085351 2 129469321.2 103740383.6 0.801273866569094 4 110369232.4 87835896.2 0.795836795182785 8 102906232.4 77420920.2 0.752344327397609 16 97551017.4 74959714.4 0.768415506038587 32 94809848.2 71200848.4 0.750985786305689 64 96087575.6 72593180 0.755489765942227 128 96154163.8 68517055.4 0.712575022154163 256 92901257.6 69054216.6 0.743307662177439 512 93646170.8 67053296.2 0.716028168874151 1024 92663219.2 70168196.8 0.75723892830177 Despite the unstable nature of the test program, the trend is as we expect. The measurement shows this patchset reduces the process_madvise() latency, proportional to the batching size. The latency gain was about 20% with the batch size 2, and it has increased to about 28% with the batch size 512, since more number of mmap locking is reduced with larger batch size. Note that the standard devitation of the measurements for each sz_batches configuration ranged from 1.9% to 7.2%. That is, this result is not very stable. The average of the standard deviations for different batch sizes were 4.62% and 4.70% for the 'before' and 'after' kernel measurements. Also note that this patch has somehow decreased latencies of madvise() and single batch size process_madvise(). Seems this code path is small enough to significantly be affected by compiler optimizations including inlining of split-out functions. Please focus on only the improvement amount that changed by the batch size. [1] https://github.com/sjp38/eval_proc_madvise This patch (of 4): Split out the madvise behavior-dependent mmap_lock operations from do_madvise(), for easier reuse of the logic in an upcoming change. [lorenzo.stoakes@oracle.com: fix madvise_[un]lock() issue] Link: https://lkml.kernel.org/r/2f448f7b-1da7-4099-aa9e-0179d47fde40@lucifer.local [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20250206061517.2958-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250206061517.2958-2-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Davidlohr Bueso Reviewed-by: Liam R. 
Howlett Cc: David Hildenbrand Cc: SeongJae Park Cc: Vlastimil Babka Signed-off-by: Andrew Morton Bug: 425827225 Change-Id: Ic87850b33b47049d65a07270a37616b6e829d7ee (cherry picked from commit 4cc39f91ef6c6f876651eb231974a59ffbcb3a21 https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-unstable) Signed-off-by: Oven --- mm/madvise.c | 62 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index b36a6a32a1e3..d63c162157fa 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1415,6 +1415,50 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ + +#ifdef CONFIG_MEMORY_FAILURE +static bool is_memory_failure(int behavior) +{ + switch (behavior) { + case MADV_HWPOISON: + case MADV_SOFT_OFFLINE: + return true; + default: + return false; + } +} +#else +static bool is_memory_failure(int behavior) +{ + return false; +} +#endif + +static int madvise_lock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return 0; + + if (madvise_need_mmap_write(behavior)) { + if (mmap_write_lock_killable(mm)) + return -EINTR; + } else { + mmap_read_lock(mm); + } + return 0; +} + +static void madvise_unlock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return; + + if (madvise_need_mmap_write(behavior)) + mmap_write_unlock(mm); + else + mmap_read_unlock(mm); +} + /* * The madvise(2) system call. * @@ -1491,7 +1535,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh { unsigned long end; int error; - int write; size_t len; struct blk_plug plug; @@ -1513,19 +1556,15 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh if (end == start) return 0; + error = madvise_lock(mm, behavior); + if (error) + return error; + #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) return madvise_inject_error(behavior, start, start + len_in); #endif - write = madvise_need_mmap_write(behavior); - if (write) { - if (mmap_write_lock_killable(mm)) - return -EINTR; - } else { - mmap_read_lock(mm); - } - start = untagged_addr_remote(mm, start); end = start + len; @@ -1534,10 +1573,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh madvise_vma_behavior); blk_finish_plug(&plug); - if (write) - mmap_write_unlock(mm); - else - mmap_read_unlock(mm); + madvise_unlock(mm, behavior); return error; } From 79dea4ada64b388354fa0e68e6ae61b0a0e2e08d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 9 Apr 2025 17:00:19 -0700 Subject: [PATCH 6/9] BACKPORT: mm/madvise: define and use madvise_behavior struct for madvise_do_behavior() Patch series "mm/madvise: batch tlb flushes for MADV_DONTNEED and MADV_FREE", v3. When process_madvise() is called to do MADV_DONTNEED[_LOCKED] or MADV_FREE with multiple address ranges, tlb flushes happen for each of the given address ranges. Because such tlb flushes are for the same process, doing those in a batch is more efficient while still being safe. Modify process_madvise() entry level code path to do such batched tlb flushes, while the internal unmap logic do only gathering of the tlb entries to flush. In more detail, modify the entry functions to initialize an mmu_gather object and pass it to the internal logic. And make the internal logic do only gathering of the tlb entries to flush into the received mmu_gather object. 
After all internal function calls are done, the entry functions flush the gathered tlb entries at once. Because process_madvise() and madvise() share the internal unmap logic, make same change to madvise() entry code together, to make code consistent and cleaner. It is only for keeping the code clean, and shouldn't degrade madvise(). It could rather provide a potential tlb flushes reduction benefit for a case that there are multiple vmas for the given address range. It is only a side effect from an effort to keep code clean, so we don't measure it separately. Similar optimizations might be applicable to other madvise behavior such as MADV_COLD and MADV_PAGEOUT. Those are simply out of the scope of this patch series, though. Patches Sequence ================ The first patch defines a new data structure for managing information that is required for batched tlb flushes (mmu_gather and behavior), and update code paths for MADV_DONTNEED[_LOCKED] and MADV_FREE handling internal logic to receive it. The second patch batches tlb flushes for MADV_FREE handling for both madvise() and process_madvise(). Remaining two patches are for MADV_DONTNEED[_LOCKED] tlb flushes batching. The third patch splits zap_page_range_single() for batching of MADV_DONTNEED[_LOCKED] handling. The fourth patch batches tlb flushes for the hint using the sub-logic that the third patch split out, and the helpers for batched tlb flushes that introduced for the MADV_FREE case, by the second patch. Test Results ============ I measured the latency to apply MADV_DONTNEED advice to 256 MiB memory using multiple process_madvise() calls. I apply the advice in 4 KiB sized regions granularity, but with varying batch size per process_madvise() call (vlen) from 1 to 1024. The source code for the measurement is available at GitHub[1]. To reduce measurement errors, I did the measurement five times. The measurement results are as below. 'sz_batch' column shows the batch size of process_madvise() calls. 'Before' and 'After' columns show the average of latencies in nanoseconds that measured five times on kernels that built without and with the tlb flushes batching of this series (patches 3 and 4), respectively. For the baseline, mm-new tree of 2025-04-09[2] has been used, after reverting the second version of this patch series and adding a temporal fix for !CONFIG_DEBUG_VM build failure[3]. 'B-stdev' and 'A-stdev' columns show ratios of latency measurements standard deviation to average in percent for 'Before' and 'After', respectively. 'Latency_reduction' shows the reduction of the latency that the 'After' has achieved compared to 'Before', in percent. Higher 'Latency_reduction' values mean more efficiency improvements. sz_batch Before B-stdev After A-stdev Latency_reduction 1 146386348 2.78 111327360.6 3.13 23.95 2 108222130 1.54 72131173.6 2.39 33.35 4 93617846.8 2.76 51859294.4 2.50 44.61 8 80555150.4 2.38 44328790 1.58 44.97 16 77272777 1.62 37489433.2 1.16 51.48 32 76478465.2 2.75 33570506 3.48 56.10 64 75810266.6 1.15 27037652.6 1.61 64.34 128 73222748 3.86 25517629.4 3.30 65.15 256 72534970.8 2.31 25002180.4 0.94 65.53 512 71809392 5.12 24152285.4 2.41 66.37 1024 73281170.2 4.53 24183615 2.09 67.00 Unexpectedly the latency has reduced (improved) even with batch size one. I think some of compiler optimizations have affected that, like also observed with the first version of this patch series. So, please focus on the proportion between the improvement and the batch size. 
As expected, tlb flushes batching provides latency reduction that proportional to the batch size. The efficiency gain ranges from about 33 percent with batch size 2, and up to 67 percent with batch size 1,024. Please note that this is a very simple microbenchmark, so real efficiency gain on real workload could be very different. This patch (of 4): To implement batched tlb flushes for MADV_DONTNEED[_LOCKED] and MADV_FREE, an mmu_gather object in addition to the behavior integer need to be passed to the internal logics. Using a struct can make it easy without increasing the number of parameters of all code paths towards the internal logic. Define a struct for the purpose and use it on the code path that starts from madvise_do_behavior() and ends on madvise_dontneed_free(). Note that this changes madvise_walk_vmas() visitor type signature, too. Specifically, it changes its 'arg' type from 'unsigned long' to the new struct pointer. Link: https://lkml.kernel.org/r/20250410000022.1901-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250410000022.1901-2-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Liam R. Howlett Cc: Rik van Riel Cc: SeongJae Park Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Bug: 425827225 Change-Id: I4e37381a88f0f606f7fc07fc63af08c3692e0886 (cherry picked from commit 066c770437835d2bd2072bd2c88a71fcbbd5ccb3 https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-unstable) [oven: Moved changes in madvise_do_behavior to do_madvise and removed changes in vector_madvise. Because these functions haven't been introduced in old kernel.] Signed-off-by: Oven --- mm/madvise.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index d63c162157fa..0b8897b7071a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -48,6 +48,11 @@ struct madvise_walk_private { void *private; }; +struct madvise_behavior { + int behavior; + struct mmu_gather *tlb; +}; + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. 
Others, which simply traverse vmas, need @@ -941,8 +946,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - int behavior) + struct madvise_behavior *madv_behavior) { + int behavior = madv_behavior->behavior; struct mm_struct *mm = vma->vm_mm; *prev = vma; @@ -1102,8 +1108,10 @@ static long madvise_remove(struct vm_area_struct *vma, static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long behavior) + void *behavior_arg) { + struct madvise_behavior *arg = behavior_arg; + int behavior = arg->behavior; int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; @@ -1123,7 +1131,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: - return madvise_dontneed_free(vma, prev, start, end, behavior); + return madvise_dontneed_free(vma, prev, start, end, arg); case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: return madvise_populate(vma, prev, start, end, behavior); @@ -1308,10 +1316,10 @@ static bool process_madvise_behavior_valid(int behavior) */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long arg, + unsigned long end, void *arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long arg)) + unsigned long end, void *arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; @@ -1369,7 +1377,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long anon_name) + void *anon_name) { int error; @@ -1379,7 +1387,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, trace_android_vh_update_vma_flags(vma); error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (struct anon_vma_name *)anon_name); + anon_name); /* * madvise() returns EAGAIN if kernel resources, such as @@ -1411,7 +1419,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, + return madvise_walk_vmas(mm, start, end, anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ @@ -1537,6 +1545,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh int error; size_t len; struct blk_plug plug; + struct madvise_behavior madv_behavior = {.behavior = behavior}; if (!madvise_behavior_valid(behavior)) return -EINVAL; @@ -1569,7 +1578,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh end = start + len; blk_start_plug(&plug); - error = madvise_walk_vmas(mm, start, end, behavior, + error = madvise_walk_vmas(mm, start, end, &madv_behavior, madvise_vma_behavior); blk_finish_plug(&plug); From aeb35eb6f21607e1ce58ccbb125e7d86f7eeb78c Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sun, 8 Jun 2025 10:01:50 +1200 Subject: [PATCH 7/9] BACKPORT: mm: use per_vma lock for MADV_DONTNEED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Certain madvise operations, especially MADV_DONTNEED, occur far more frequently than other madvise options, particularly in native and Java heaps 
for dynamic memory management. Currently, the mmap_lock is always held during these operations, even when unnecessary. This causes lock contention and can lead to severe priority inversion, where low-priority threads—such as Android's HeapTaskDaemon— hold the lock and block higher-priority threads. This patch enables the use of per-VMA locks when the advised range lies entirely within a single VMA, avoiding the need for full VMA traversal. In practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs. Tangquan's testing shows that over 99.5% of memory reclaimed by Android benefits from this per-VMA lock optimization. After extended runtime, 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while only 1,231 fell back to mmap_lock. To simplify handling, the implementation falls back to the standard mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of userfaultfd_remove(). Many thanks to Lorenzo's work[1] on "mm/madvise: support VMA read locks for MADV_DONTNEED[_LOCKED]" Then use this mechanism to permit VMA locking to be done later in the madvise() logic and also to allow altering of the locking mode to permit falling back to an mmap read lock if required." One important point, as pointed out by Jann[2], is that untagged_addr_remote() requires holding mmap_lock. This is because address tagging on x86 and RISC-V is quite complex. Until untagged_addr_remote() becomes atomic—which seems unlikely in the near future—we cannot support per-VMA locks for remote processes. So for now, only local processes are supported. Link: https://lore.kernel.org/all/0b96ce61-a52c-4036-b5b6-5c50783db51f@lucifer.local/ [1] Link: https://lore.kernel.org/all/CAG48ez11zi-1jicHUZtLhyoNPGGVB+ROeAJCUw48bsjk4bbEkA@mail.gmail.com/ [2] Link: https://lkml.kernel.org/r/20250607220150.2980-1-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Lorenzo Stoakes Acked-by: Qi Zheng Cc: "Liam R. Howlett" Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Jann Horn Cc: Suren Baghdasaryan Cc: Lokesh Gidra Cc: Tangquan Zheng Signed-off-by: Andrew Morton Bug: 425827225 Change-Id: I9485baaf04a09d84e89157dab9bc9185f091947d (cherry picked from commit ff6e99ed6d49725fa50862bdedfc9468c592c9fc git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) [oven: Moved changes in madvise_do_behavior out to do_madvise and removed changes in vector_madvise. Because these functions haven't been introduced in old kernel. Resolved other minor conflict as well.] Signed-off-by: Oven --- mm/madvise.c | 184 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 141 insertions(+), 43 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 0b8897b7071a..845d9c6e63ed 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -48,36 +48,19 @@ struct madvise_walk_private { void *private; }; +enum madvise_lock_mode { + MADVISE_NO_LOCK, + MADVISE_MMAP_READ_LOCK, + MADVISE_MMAP_WRITE_LOCK, + MADVISE_VMA_READ_LOCK, +}; + struct madvise_behavior { int behavior; struct mmu_gather *tlb; + enum madvise_lock_mode lock_mode; }; -/* - * Any behaviour which results in changes to the vma->vm_flags needs to - * take mmap_lock for writing. Others, which simply traverse vmas, need - * to only take it for reading. 
- */ -static int madvise_need_mmap_write(int behavior) -{ - switch (behavior) { - case MADV_REMOVE: - case MADV_WILLNEED: - case MADV_DONTNEED: - case MADV_DONTNEED_LOCKED: - case MADV_COLD: - case MADV_PAGEOUT: - case MADV_FREE: - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - case MADV_COLLAPSE: - return 0; - default: - /* be safe, default to 1. list exceptions explicitly */ - return 1; - } -} - #ifdef CONFIG_ANON_VMA_NAME struct anon_vma_name *anon_vma_name_alloc(const char *name) { @@ -1306,6 +1289,44 @@ static bool process_madvise_behavior_valid(int behavior) } } +/* + * Try to acquire a VMA read lock if possible. + * + * We only support this lock over a single VMA, which the input range must + * span either partially or fully. + * + * This function always returns with an appropriate lock held. If a VMA read + * lock could be acquired, we return the locked VMA. + * + * If a VMA read lock could not be acquired, we return NULL and expect caller to + * fallback to mmap lock behaviour. + */ +static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm, + struct madvise_behavior *madv_behavior, + unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + + vma = lock_vma_under_rcu(mm, start); + if (!vma) + goto take_mmap_read_lock; + /* + * Must span only a single VMA; uffd and remote processes are + * unsupported. + */ + if (end > vma->vm_end || current->mm != mm || + userfaultfd_armed(vma)) { + vma_end_read(vma); + goto take_mmap_read_lock; + } + return vma; + +take_mmap_read_lock: + mmap_read_lock(mm); + madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK; + return NULL; +} + /* * Walk the vmas in range [start,end), and call the visit function on each one. * The visit function will get start and end parameters that cover the overlap @@ -1316,7 +1337,8 @@ static bool process_madvise_behavior_valid(int behavior) */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, void *arg, + unsigned long end, struct madvise_behavior *madv_behavior, + void *arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, void *arg)) @@ -1325,6 +1347,20 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, struct vm_area_struct *prev; unsigned long tmp; int unmapped_error = 0; + int error; + + /* + * If VMA read lock is supported, apply madvise to a single VMA + * tentatively, avoiding walking VMAs. + */ + if (madv_behavior && madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) { + vma = try_vma_read_lock(mm, madv_behavior, start, end); + if (vma) { + error = visit(vma, &prev, start, end, arg); + vma_end_read(vma); + return error; + } + } /* * If the interval [start,end) covers some unmapped address @@ -1336,8 +1372,6 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, prev = vma; for (;;) { - int error; - /* Still start < end. */ if (!vma) return -ENOMEM; @@ -1419,7 +1453,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, anon_name, + return madvise_walk_vmas(mm, start, end, NULL, anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ @@ -1442,29 +1476,93 @@ static bool is_memory_failure(int behavior) } #endif -static int madvise_lock(struct mm_struct *mm, int behavior) +/* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_lock for writing. 
Others, which simply traverse vmas, need + * to only take it for reading. + */ +static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior) { - if (is_memory_failure(behavior)) - return 0; + int behavior = madv_behavior->behavior; - if (madvise_need_mmap_write(behavior)) { + if (is_memory_failure(behavior)) + return MADVISE_NO_LOCK; + + switch (behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_COLD: + case MADV_PAGEOUT: + case MADV_FREE: + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + case MADV_COLLAPSE: + return MADVISE_MMAP_READ_LOCK; + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + return MADVISE_VMA_READ_LOCK; + default: + return MADVISE_MMAP_WRITE_LOCK; + } +} + +static int madvise_lock(struct mm_struct *mm, + struct madvise_behavior *madv_behavior) +{ + enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior); + + switch (lock_mode) { + case MADVISE_NO_LOCK: + break; + case MADVISE_MMAP_WRITE_LOCK: if (mmap_write_lock_killable(mm)) return -EINTR; - } else { + break; + case MADVISE_MMAP_READ_LOCK: mmap_read_lock(mm); + break; + case MADVISE_VMA_READ_LOCK: + /* We will acquire the lock per-VMA in madvise_walk_vmas(). */ + break; } + + madv_behavior->lock_mode = lock_mode; return 0; } -static void madvise_unlock(struct mm_struct *mm, int behavior) +static void madvise_unlock(struct mm_struct *mm, + struct madvise_behavior *madv_behavior) { - if (is_memory_failure(behavior)) + switch (madv_behavior->lock_mode) { + case MADVISE_NO_LOCK: return; - - if (madvise_need_mmap_write(behavior)) + case MADVISE_MMAP_WRITE_LOCK: mmap_write_unlock(mm); - else + break; + case MADVISE_MMAP_READ_LOCK: mmap_read_unlock(mm); + break; + case MADVISE_VMA_READ_LOCK: + /* We will drop the lock per-VMA in madvise_walk_vmas(). */ + break; + } + + madv_behavior->lock_mode = MADVISE_NO_LOCK; +} + +/* + * untagged_addr_remote() assumes mmap_lock is already held. On + * architectures like x86 and RISC-V, tagging is tricky because each + * mm may have a different tagging mask. However, we might only hold + * the per-VMA lock (currently only local processes are supported), + * so untagged_addr is used to avoid the mmap_lock assertion for + * local processes. + */ +static inline unsigned long get_untagged_addr(struct mm_struct *mm, + unsigned long start) +{ + return current->mm == mm ? untagged_addr(start) : + untagged_addr_remote(mm, start); } /* @@ -1565,7 +1663,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh if (end == start) return 0; - error = madvise_lock(mm, behavior); + error = madvise_lock(mm, &madv_behavior); if (error) return error; @@ -1574,15 +1672,15 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh return madvise_inject_error(behavior, start, start + len_in); #endif - start = untagged_addr_remote(mm, start); + start = get_untagged_addr(mm, start); end = start + len; blk_start_plug(&plug); error = madvise_walk_vmas(mm, start, end, &madv_behavior, - madvise_vma_behavior); + &madv_behavior, madvise_vma_behavior); blk_finish_plug(&plug); - madvise_unlock(mm, behavior); + madvise_unlock(mm, &madv_behavior); return error; } From 75a0fcbfdf35e712350c1c411c0735af125ab124 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jun 2025 20:10:38 +0000 Subject: [PATCH 8/9] ANDROID: look up vma under RCU in linker_ctx() madvise_dontneed_single_vma() calls linker_ctx() to detect whether the madvise was initiated by the dynamic linker. 
This function requires mmap_lock in order to lookup the vma, however with recent changes we do not hold mmap_lock while executing MADV_DONTNEED. Lookup the vma under RCU instead to avoid lockdep warning. Bug: 425827225 Change-Id: Ie5e0243f359b96292d4f32ee3299050b871dc6c5 Signed-off-by: Suren Baghdasaryan --- mm/pgsize_migration.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c index f72240b0de79..07dd1f3fa7d8 100644 --- a/mm/pgsize_migration.c +++ b/mm/pgsize_migration.c @@ -167,8 +167,6 @@ static __always_inline bool str_has_suffix(const char *str, const char *suffix) * VMAs of the current task. * * Returns true if in linker context, otherwise false. - * - * Caller must hold mmap lock in read mode. */ static inline bool linker_ctx(void) { @@ -180,14 +178,14 @@ static inline bool linker_ctx(void) if (!regs) return false; - vma = find_vma(mm, instruction_pointer(regs)); + vma = lock_vma_under_rcu(mm, instruction_pointer(regs)); /* Current execution context, the VMA must be present */ BUG_ON(!vma); file = vma->vm_file; if (!file) - return false; + goto out; if ((vma->vm_flags & VM_EXEC)) { char buf[64]; @@ -205,10 +203,13 @@ static inline bool linker_ctx(void) * * Check the base name (linker64). */ - if (!strcmp(kbasename(path), "linker64")) + if (!strcmp(kbasename(path), "linker64")) { + vma_end_read(vma); return true; + } } - +out: + vma_end_read(vma); return false; } From 464ddce4070dfa9633c25d958adffaa3f3ef5a75 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 26 Jun 2025 13:25:58 -0700 Subject: [PATCH 9/9] ANDROID: mthp: Prevent TAO non-movable allocations from movable zone aosp/I30ac33034f0ff697a4330ef752babf94d4e234f5 removed the ability to allocate non-movable __GPF_COMP allocations from TAO zones as they could fallback to the movable-zone; causing various issues. This was reintroduced by aosp/I2fdfc4df8b03daa96fd6c2c8c6630d26a8509ad0 iff the movable zone is not enabled; adding additional checks to verify this. However the case of movable_node command line parameter was missed; when this is set memory blocks can be onlined into the movable-zone. Add a check for !movable_node_is_enabled() to avoid such __GPF_COMP allocations from TAO (virtual zones) if it's possible that memory blocks can come online to the movable-zone. Bug: 427924381 Bug: 313807618 Bug: 353906885 Change-Id: If2ebd4d7f4badd99599b01939cd89d0cba9b3fb5 Signed-off-by: Kalesh Singh --- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 43dc4463388b..808caa5c170c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -139,9 +139,9 @@ static inline enum zone_type __gfp_zone(gfp_t flags) if (z == ZONE_MOVABLE) return LAST_VIRT_ZONE; - /* Allow dma-buf etc to use virtual zones */ + /* Allow dma-buf etc to use virtual zones, if there is no movable zone */ if ((flags & __GFP_COMP) && (flags & __GFP_HIGHMEM) && - !static_branch_unlikely(&movablecore_enabled)) + !static_branch_unlikely(&movablecore_enabled) && !movable_node_is_enabled()) return LAST_VIRT_ZONE; return z;