From 56cc224601caca24f778f1b38a4a1b09a1b31a8f Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 23 Jun 2025 13:39:53 +0000 Subject: [PATCH 1/9] ANDROID: BACKPORT: KVM: arm64: Always unmap the pvmfw region at stage-2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The donation of the pvmfw region to pKVM is currently done transparently as part of fix_host_ownership(). However, this function only runs over PA ranges covered by the memblock list, although there is no guarantee for the pvmfw region to be advertised in a memory node in DT. In this case, the pKVM init will appear to succeed while silently keeping valid host stage-2 mappings to the pvmfw region. Fix this by forcefully registering the pvmfw region in the pKVM memblock list. BACKPORT: Fix usage of pvmfw_size and pvmfw_base which are pointers in 6.6 and earlier. Bug: 278749606 Bug: 424382332 Reported-by: Bartłomiej Grzesik Suggested-by: Will Deacon Change-Id: I8f5498df25debb432b7dffd1e40a8910bcec7b49 Signed-off-by: Quentin Perret --- arch/arm64/kvm/pkvm.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 4523cc6f2725..61b524129366 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -65,6 +65,7 @@ static void __init sort_memblock_regions(void) static int __init register_memblock_regions(void) { struct memblock_region *reg; + bool pvmfw_in_mem = false; for_each_mem_region(reg) { if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) @@ -72,6 +73,27 @@ static int __init register_memblock_regions(void) hyp_memory[*hyp_memblock_nr_ptr] = *reg; (*hyp_memblock_nr_ptr)++; + + if (!*pvmfw_size || pvmfw_in_mem || + !memblock_addrs_overlap(reg->base, reg->size, *pvmfw_base, *pvmfw_size)) + continue; + /* If the pvmfw region overlaps a memblock, it must be a subset */ + if (*pvmfw_base < reg->base || + (*pvmfw_base + *pvmfw_size) > (reg->base + reg->size)) + return -EINVAL; + pvmfw_in_mem = true; + } + + if (*pvmfw_size && !pvmfw_in_mem) { + if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) + return -ENOMEM; + + hyp_memory[*hyp_memblock_nr_ptr] = (struct memblock_region) { + .base = *pvmfw_base, + .size = *pvmfw_size, + .flags = MEMBLOCK_NOMAP, + }; + (*hyp_memblock_nr_ptr)++; } sort_memblock_regions(); From 390d8897c3243bbb3b122a08b4d4b6e4bd5e796d Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Wed, 25 Jun 2025 20:10:29 +0800 Subject: [PATCH 2/9] ANDROID: vendor hooks: Add new android_rvh for adjust water mark The trace_android_vh_alloc_pages_adjust_wmark() and trace_android_vh_alloc_pages_reset_wmark() have been deprecated, because they cannot be used in a CPU offline or non-atomic context, the trace_android_rvh_alloc_pages_adjust_wmark() and trace_android_rvh_alloc_pages_reset_wmark() should be used instead. 
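For reference, a vendor module would attach to the new restricted hooks roughly as below. This is an illustrative sketch only (not part of this patch): the probe name, empty policy body, and module boilerplate are placeholders. Restricted-hook probes receive the registration cookie as their first argument and, unlike regular vendor hooks, cannot be unregistered.

```c
#include <linux/module.h>
#include <trace/hooks/mm.h>

/* Probe signature mirrors the hook's TP_PROTO, preceded by the data cookie. */
static void vendor_adjust_wmark(void *unused, gfp_t gfp_mask, int order,
				int *alloc_flags)
{
	/* Vendor policy would inspect gfp_mask/order and tweak *alloc_flags. */
}

static int __init vendor_wmark_probe_init(void)
{
	/* Restricted hooks can only be registered, never unregistered. */
	return register_trace_android_rvh_alloc_pages_adjust_wmark(
			vendor_adjust_wmark, NULL);
}
module_init(vendor_wmark_probe_init);
MODULE_LICENSE("GPL");
```
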
Bug: 427378244 Change-Id: I641a4bb5548120686a67a56067648b4e23b2f0e1 Signed-off-by: Qianfeng Rong --- drivers/android/vendor_hooks.c | 2 ++ include/trace/hooks/mm.h | 9 +++++++++ mm/page_alloc.c | 16 +++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index eaea41831d1f..d3f7ff4fde56 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -463,6 +463,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reclaim_bypass); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_failure_bypass); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_adjust_wmark); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reset_wmark); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_alloc_pages_adjust_wmark); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_alloc_pages_reset_wmark); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_watermark_fast_ok); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_fiq_dump); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_swapmem_gather_init); diff --git a/include/trace/hooks/mm.h b/include/trace/hooks/mm.h index 00df4c5ea263..65eb40c00944 100644 --- a/include/trace/hooks/mm.h +++ b/include/trace/hooks/mm.h @@ -156,6 +156,15 @@ DECLARE_HOOK(android_vh_alloc_pages_reset_wmark, unsigned long direct_reclaim_retries), TP_ARGS(gfp_mask, order, alloc_flags, did_some_progress, no_progress_loops, direct_reclaim_retries)); +DECLARE_RESTRICTED_HOOK(android_rvh_alloc_pages_adjust_wmark, + TP_PROTO(gfp_t gfp_mask, int order, int *alloc_flags), + TP_ARGS(gfp_mask, order, alloc_flags), 3); +DECLARE_RESTRICTED_HOOK(android_rvh_alloc_pages_reset_wmark, + TP_PROTO(gfp_t gfp_mask, int order, int *alloc_flags, + unsigned long *did_some_progress, int *no_progress_loops, + unsigned long direct_reclaim_retries), + TP_ARGS(gfp_mask, order, alloc_flags, did_some_progress, + no_progress_loops, direct_reclaim_retries), 6); DECLARE_HOOK(android_vh_unreserve_highatomic_bypass, TP_PROTO(bool force, struct zone *zone, bool *skip_unreserve_highatomic), TP_ARGS(force, zone, skip_unreserve_highatomic)); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b42afcd0d3c3..152b0424fcbf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4438,8 +4438,15 @@ restart: if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); - if (can_direct_reclaim && !direct_reclaim_retries && !(current->flags & PF_MEMALLOC)) + if (can_direct_reclaim && !direct_reclaim_retries && !(current->flags & PF_MEMALLOC)) { + /* + * The trace_android_vh_alloc_pages_adjust_wmark() has been deprecated + * because it cannot be used in a CPU offline or non-atomic context, + * please use trace_android_rvh_alloc_pages_adjust_wmark(). + */ trace_android_vh_alloc_pages_adjust_wmark(gfp_mask, order, &alloc_flags); + trace_android_rvh_alloc_pages_adjust_wmark(gfp_mask, order, &alloc_flags); + } /* * The adjusted alloc_flags might result in immediate success, so try @@ -4587,8 +4594,15 @@ retry: !(gfp_mask & __GFP_RETRY_MAYFAIL))) goto nopage; + /* + * The trace_android_vh_alloc_pages_reset_wmark() has been deprecated + * because it cannot be used in a CPU offline or non-atomic context, + * please use trace_android_rvh_alloc_pages_reset_wmark(). 
+ */ trace_android_vh_alloc_pages_reset_wmark(gfp_mask, order, &alloc_flags, &did_some_progress, &no_progress_loops, direct_reclaim_retries); + trace_android_rvh_alloc_pages_reset_wmark(gfp_mask, order, + &alloc_flags, &did_some_progress, &no_progress_loops, direct_reclaim_retries); if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) From 1c1f2b75268a960070edd1c8c43da0214b154b0f Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Wed, 25 Jun 2025 20:22:58 +0800 Subject: [PATCH 3/9] ANDROID: GKI: vivo add symbols to symbol list 2 function symbol(s) added 'int __traceiter_android_rvh_alloc_pages_adjust_wmark(void*, gfp_t, int, int*)' 'int __traceiter_android_rvh_alloc_pages_reset_wmark(void*, gfp_t, int, int*, unsigned long*, int*, unsigned long)' 2 variable symbol(s) added 'struct tracepoint __tracepoint_android_rvh_alloc_pages_adjust_wmark' 'struct tracepoint __tracepoint_android_rvh_alloc_pages_reset_wmark' Bug: 427378244 Change-Id: I2929065b78ae40226c7da679eadd898259e4b9e7 Signed-off-by: Qianfeng Rong --- android/abi_gki_aarch64.stg | 40 ++++++++++++++++++++++++++++++++++++ android/abi_gki_aarch64_vivo | 4 ++++ 2 files changed, 44 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 0de7176f5e86..d8ed1e76d857 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -359430,6 +359430,24 @@ elf_symbol { type_id: 0x9bc8472e full_name: "__traceiter_android_rvh_alloc_and_link_pwqs" } +elf_symbol { + id: 0xc0fd1a1f + name: "__traceiter_android_rvh_alloc_pages_adjust_wmark" + is_defined: true + symbol_type: FUNCTION + crc: 0x6a18478a + type_id: 0x9870a448 + full_name: "__traceiter_android_rvh_alloc_pages_adjust_wmark" +} +elf_symbol { + id: 0x6eed3175 + name: "__traceiter_android_rvh_alloc_pages_reset_wmark" + is_defined: true + symbol_type: FUNCTION + crc: 0x414d4c97 + type_id: 0x9870a59a + full_name: "__traceiter_android_rvh_alloc_pages_reset_wmark" +} elf_symbol { id: 0xef79dd4d name: "__traceiter_android_rvh_alloc_workqueue" @@ -366522,6 +366540,24 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_rvh_alloc_and_link_pwqs" } +elf_symbol { + id: 0x89ff3495 + name: "__tracepoint_android_rvh_alloc_pages_adjust_wmark" + is_defined: true + symbol_type: OBJECT + crc: 0xc63d662f + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_rvh_alloc_pages_adjust_wmark" +} +elf_symbol { + id: 0xab6e1e0f + name: "__tracepoint_android_rvh_alloc_pages_reset_wmark" + is_defined: true + symbol_type: OBJECT + crc: 0xdbce1a35 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_rvh_alloc_pages_reset_wmark" +} elf_symbol { id: 0x0b219d2b name: "__tracepoint_android_rvh_alloc_workqueue" @@ -436771,6 +436807,8 @@ interface { symbol_id: 0xb42422d5 symbol_id: 0xb3d70eab symbol_id: 0x9ca1a40f + symbol_id: 0xc0fd1a1f + symbol_id: 0x6eed3175 symbol_id: 0xef79dd4d symbol_id: 0x0b48afa1 symbol_id: 0xa927338c @@ -437559,6 +437597,8 @@ interface { symbol_id: 0x4b7a8fd7 symbol_id: 0xcd36f539 symbol_id: 0x33f0c37d + symbol_id: 0x89ff3495 + symbol_id: 0xab6e1e0f symbol_id: 0x0b219d2b symbol_id: 0x748c1fd7 symbol_id: 0xcb42202e diff --git a/android/abi_gki_aarch64_vivo b/android/abi_gki_aarch64_vivo index b8f2d60402fd..093e3588d263 100644 --- a/android/abi_gki_aarch64_vivo +++ b/android/abi_gki_aarch64_vivo @@ -108,9 +108,11 @@ __traceiter_android_vh_account_process_tick_gran __traceiter_android_vh_adjust_kvmalloc_flags __traceiter_android_vh_alloc_pages_adjust_wmark + 
__traceiter_android_rvh_alloc_pages_adjust_wmark __traceiter_android_vh_alloc_pages_failure_bypass __traceiter_android_vh_alloc_pages_reclaim_bypass __traceiter_android_vh_alloc_pages_reset_wmark + __traceiter_android_rvh_alloc_pages_reset_wmark __traceiter_android_vh_alter_mutex_list_add __traceiter_android_vh_alter_rwsem_list_add __traceiter_android_vh_bd_link_disk_holder @@ -241,9 +243,11 @@ __tracepoint_android_vh_account_process_tick_gran __tracepoint_android_vh_adjust_kvmalloc_flags __tracepoint_android_vh_alloc_pages_adjust_wmark + __tracepoint_android_rvh_alloc_pages_adjust_wmark __tracepoint_android_vh_alloc_pages_failure_bypass __tracepoint_android_vh_alloc_pages_reclaim_bypass __tracepoint_android_vh_alloc_pages_reset_wmark + __tracepoint_android_rvh_alloc_pages_reset_wmark __tracepoint_android_vh_alter_mutex_list_add __tracepoint_android_vh_alter_rwsem_list_add __tracepoint_android_vh_bd_link_disk_holder From fe3caa5756634542c9d65436ad7c196fac7edcde Mon Sep 17 00:00:00 2001 From: Juan Yescas Date: Wed, 25 Jun 2025 17:53:01 -0700 Subject: [PATCH 4/9] ANDROID: mm: Set __GFP_CMA in do_swap_page() for folio allocations In the do_swap_page() path, the memory allocations were failing even if there were free CMA pages. The allocations were not fallbacking to CMA bucket. This was due the requested folios were not marked as __GFP_CMA and as a consequence, the ALLOC_CMA was not set. ``` static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, unsigned int alloc_flags) { /* * If cma_redirect_restricted is true, set ALLOC_CMA only for * movable allocations that have __GFP_CMA. */ if ((!cma_redirect_restricted() || gfp_mask & __GFP_CMA) && gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; trace_android_vh_alloc_flags_cma_adjust(gfp_mask, &alloc_flags); return alloc_flags; } ``` This was introduced in the change I9d16a9cae1c6c0f6cdb03183038fab095843001e ("BACKPORT: mm: support large folios swap-in for sync io devices") Bug: 427802573 Bug: 425779146 Bug: 422586344 Bug: 313807618 Test: Built and run kernel Change-Id: Ied33777bb04198f1e4a69b91f002ae70d0471bb3 Fixes: 988dc02cddcb ("BACKPORT: mm: support large folios swap-in for sync io devices") Signed-off-by: Juan Yescas --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index dfbd0a2795db..a04841dc9291 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3962,7 +3962,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) struct folio *folio; swp_entry_t entry; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE|__GFP_CMA, 0, vma, vmf->address, false); if (!folio) return NULL; From 87cddfadcdd3614e5d3123bbfa55683d8389a8b8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 5 Feb 2025 22:15:14 -0800 Subject: [PATCH 5/9] BACKPORT: mm/madvise: split out mmap locking operations for madvise() Patch series "mm/madvise: remove redundant mmap_lock operations from process_madvise()". process_madvise() calls do_madvise() for each address range. Then, each do_madvise() invocation holds and releases same mmap_lock. Optimize the redundant lock operations by splitting do_madvise() internal logic including the mmap_lock operations, and calling the small logic directly from process_madvise() in a sequence that removes the redundant locking. As a result of this change, process_madvise() becomes more efficient and less racy in terms of its results and latency. 
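For illustration, the batched caller pattern that benefits here looks roughly like the sketch below (userspace, not part of this patch; the helper name and the assumption that the kernel headers define __NR_process_madvise are mine). A single process_madvise() call covers many ranges, so after this series the mmap_lock is taken once per batch rather than once per range.

```c
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

/* Apply MADV_DONTNEED to 'vlen' address ranges of the target process
 * (identified by pidfd) with one syscall; flags must currently be 0. */
static long dontneed_batch(int pidfd, const struct iovec *iov, size_t vlen)
{
	return syscall(__NR_process_madvise, pidfd, iov, vlen,
		       MADV_DONTNEED, 0);
}
```
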
Note that the potential downside of this series is that other mmap_lock holders may take more time due to the increased length of mmap_lock critical section for process_madvise() calls. But there is maximum limit in the kernel space (IOV_MAX), and userspace can control the critical section length by setting the request size. Hence, the downside would be limited and controllable. Evaluation ========== I measured the time to apply MADV_DONTNEED advice to 256 MiB memory using multiple madvise() calls, 4 KiB per each call. I also do the same with process_madvise(), but with varying batch size (vlen) from 1 to 1024. The source code for the measurement is available at GitHub[1]. Because the microbenchmark result is not that stable, I ran each configuration five times and use the average. The measurement results are as below. 'sz_batches' column shows the batch size of process_madvise() calls. '0' batch size is for madvise() calls case. 'before' and 'after' columns are the measured time to apply MADV_DONTNEED to the 256 MiB memory buffer in nanoseconds, on kernels that built without and with the last patch of this series, respectively. So lower value means better efficiency. 'after/before' column is the ratio of 'after' to 'before'. sz_batches before after after/before 0 146294215.2 121280536.2 0.829017989769427 1 165851018.8 136305598.2 0.821855658085351 2 129469321.2 103740383.6 0.801273866569094 4 110369232.4 87835896.2 0.795836795182785 8 102906232.4 77420920.2 0.752344327397609 16 97551017.4 74959714.4 0.768415506038587 32 94809848.2 71200848.4 0.750985786305689 64 96087575.6 72593180 0.755489765942227 128 96154163.8 68517055.4 0.712575022154163 256 92901257.6 69054216.6 0.743307662177439 512 93646170.8 67053296.2 0.716028168874151 1024 92663219.2 70168196.8 0.75723892830177 Despite the unstable nature of the test program, the trend is as we expect. The measurement shows this patchset reduces the process_madvise() latency, proportional to the batching size. The latency gain was about 20% with the batch size 2, and it has increased to about 28% with the batch size 512, since more number of mmap locking is reduced with larger batch size. Note that the standard devitation of the measurements for each sz_batches configuration ranged from 1.9% to 7.2%. That is, this result is not very stable. The average of the standard deviations for different batch sizes were 4.62% and 4.70% for the 'before' and 'after' kernel measurements. Also note that this patch has somehow decreased latencies of madvise() and single batch size process_madvise(). Seems this code path is small enough to significantly be affected by compiler optimizations including inlining of split-out functions. Please focus on only the improvement amount that changed by the batch size. [1] https://github.com/sjp38/eval_proc_madvise This patch (of 4): Split out the madvise behavior-dependent mmap_lock operations from do_madvise(), for easier reuse of the logic in an upcoming change. [lorenzo.stoakes@oracle.com: fix madvise_[un]lock() issue] Link: https://lkml.kernel.org/r/2f448f7b-1da7-4099-aa9e-0179d47fde40@lucifer.local [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20250206061517.2958-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250206061517.2958-2-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Davidlohr Bueso Reviewed-by: Liam R. 
Howlett Cc: David Hildenbrand Cc: SeongJae Park Cc: Vlastimil Babka Signed-off-by: Andrew Morton Bug: 425827225 Change-Id: Ic87850b33b47049d65a07270a37616b6e829d7ee (cherry picked from commit 4cc39f91ef6c6f876651eb231974a59ffbcb3a21 https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-unstable) Signed-off-by: Oven --- mm/madvise.c | 62 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index b36a6a32a1e3..d63c162157fa 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1415,6 +1415,50 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ + +#ifdef CONFIG_MEMORY_FAILURE +static bool is_memory_failure(int behavior) +{ + switch (behavior) { + case MADV_HWPOISON: + case MADV_SOFT_OFFLINE: + return true; + default: + return false; + } +} +#else +static bool is_memory_failure(int behavior) +{ + return false; +} +#endif + +static int madvise_lock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return 0; + + if (madvise_need_mmap_write(behavior)) { + if (mmap_write_lock_killable(mm)) + return -EINTR; + } else { + mmap_read_lock(mm); + } + return 0; +} + +static void madvise_unlock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return; + + if (madvise_need_mmap_write(behavior)) + mmap_write_unlock(mm); + else + mmap_read_unlock(mm); +} + /* * The madvise(2) system call. * @@ -1491,7 +1535,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh { unsigned long end; int error; - int write; size_t len; struct blk_plug plug; @@ -1513,19 +1556,15 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh if (end == start) return 0; + error = madvise_lock(mm, behavior); + if (error) + return error; + #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) return madvise_inject_error(behavior, start, start + len_in); #endif - write = madvise_need_mmap_write(behavior); - if (write) { - if (mmap_write_lock_killable(mm)) - return -EINTR; - } else { - mmap_read_lock(mm); - } - start = untagged_addr_remote(mm, start); end = start + len; @@ -1534,10 +1573,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh madvise_vma_behavior); blk_finish_plug(&plug); - if (write) - mmap_write_unlock(mm); - else - mmap_read_unlock(mm); + madvise_unlock(mm, behavior); return error; } From 79dea4ada64b388354fa0e68e6ae61b0a0e2e08d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 9 Apr 2025 17:00:19 -0700 Subject: [PATCH 6/9] BACKPORT: mm/madvise: define and use madvise_behavior struct for madvise_do_behavior() Patch series "mm/madvise: batch tlb flushes for MADV_DONTNEED and MADV_FREE", v3. When process_madvise() is called to do MADV_DONTNEED[_LOCKED] or MADV_FREE with multiple address ranges, tlb flushes happen for each of the given address ranges. Because such tlb flushes are for the same process, doing those in a batch is more efficient while still being safe. Modify process_madvise() entry level code path to do such batched tlb flushes, while the internal unmap logic do only gathering of the tlb entries to flush. In more detail, modify the entry functions to initialize an mmu_gather object and pass it to the internal logic. And make the internal logic do only gathering of the tlb entries to flush into the received mmu_gather object. 
After all internal function calls are done, the entry functions flush the gathered tlb entries at once. Because process_madvise() and madvise() share the internal unmap logic, make same change to madvise() entry code together, to make code consistent and cleaner. It is only for keeping the code clean, and shouldn't degrade madvise(). It could rather provide a potential tlb flushes reduction benefit for a case that there are multiple vmas for the given address range. It is only a side effect from an effort to keep code clean, so we don't measure it separately. Similar optimizations might be applicable to other madvise behavior such as MADV_COLD and MADV_PAGEOUT. Those are simply out of the scope of this patch series, though. Patches Sequence ================ The first patch defines a new data structure for managing information that is required for batched tlb flushes (mmu_gather and behavior), and update code paths for MADV_DONTNEED[_LOCKED] and MADV_FREE handling internal logic to receive it. The second patch batches tlb flushes for MADV_FREE handling for both madvise() and process_madvise(). Remaining two patches are for MADV_DONTNEED[_LOCKED] tlb flushes batching. The third patch splits zap_page_range_single() for batching of MADV_DONTNEED[_LOCKED] handling. The fourth patch batches tlb flushes for the hint using the sub-logic that the third patch split out, and the helpers for batched tlb flushes that introduced for the MADV_FREE case, by the second patch. Test Results ============ I measured the latency to apply MADV_DONTNEED advice to 256 MiB memory using multiple process_madvise() calls. I apply the advice in 4 KiB sized regions granularity, but with varying batch size per process_madvise() call (vlen) from 1 to 1024. The source code for the measurement is available at GitHub[1]. To reduce measurement errors, I did the measurement five times. The measurement results are as below. 'sz_batch' column shows the batch size of process_madvise() calls. 'Before' and 'After' columns show the average of latencies in nanoseconds that measured five times on kernels that built without and with the tlb flushes batching of this series (patches 3 and 4), respectively. For the baseline, mm-new tree of 2025-04-09[2] has been used, after reverting the second version of this patch series and adding a temporal fix for !CONFIG_DEBUG_VM build failure[3]. 'B-stdev' and 'A-stdev' columns show ratios of latency measurements standard deviation to average in percent for 'Before' and 'After', respectively. 'Latency_reduction' shows the reduction of the latency that the 'After' has achieved compared to 'Before', in percent. Higher 'Latency_reduction' values mean more efficiency improvements. sz_batch Before B-stdev After A-stdev Latency_reduction 1 146386348 2.78 111327360.6 3.13 23.95 2 108222130 1.54 72131173.6 2.39 33.35 4 93617846.8 2.76 51859294.4 2.50 44.61 8 80555150.4 2.38 44328790 1.58 44.97 16 77272777 1.62 37489433.2 1.16 51.48 32 76478465.2 2.75 33570506 3.48 56.10 64 75810266.6 1.15 27037652.6 1.61 64.34 128 73222748 3.86 25517629.4 3.30 65.15 256 72534970.8 2.31 25002180.4 0.94 65.53 512 71809392 5.12 24152285.4 2.41 66.37 1024 73281170.2 4.53 24183615 2.09 67.00 Unexpectedly the latency has reduced (improved) even with batch size one. I think some of compiler optimizations have affected that, like also observed with the first version of this patch series. So, please focus on the proportion between the improvement and the batch size. 
As expected, tlb flushes batching provides latency reduction that proportional to the batch size. The efficiency gain ranges from about 33 percent with batch size 2, and up to 67 percent with batch size 1,024. Please note that this is a very simple microbenchmark, so real efficiency gain on real workload could be very different. This patch (of 4): To implement batched tlb flushes for MADV_DONTNEED[_LOCKED] and MADV_FREE, an mmu_gather object in addition to the behavior integer need to be passed to the internal logics. Using a struct can make it easy without increasing the number of parameters of all code paths towards the internal logic. Define a struct for the purpose and use it on the code path that starts from madvise_do_behavior() and ends on madvise_dontneed_free(). Note that this changes madvise_walk_vmas() visitor type signature, too. Specifically, it changes its 'arg' type from 'unsigned long' to the new struct pointer. Link: https://lkml.kernel.org/r/20250410000022.1901-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250410000022.1901-2-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Liam R. Howlett Cc: Rik van Riel Cc: SeongJae Park Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Bug: 425827225 Change-Id: I4e37381a88f0f606f7fc07fc63af08c3692e0886 (cherry picked from commit 066c770437835d2bd2072bd2c88a71fcbbd5ccb3 https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-unstable) [oven: Moved changes in madvise_do_behavior to do_madvise and removed changes in vector_madvise. Because these functions haven't been introduced in old kernel.] Signed-off-by: Oven --- mm/madvise.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index d63c162157fa..0b8897b7071a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -48,6 +48,11 @@ struct madvise_walk_private { void *private; }; +struct madvise_behavior { + int behavior; + struct mmu_gather *tlb; +}; + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. 
Others, which simply traverse vmas, need @@ -941,8 +946,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - int behavior) + struct madvise_behavior *madv_behavior) { + int behavior = madv_behavior->behavior; struct mm_struct *mm = vma->vm_mm; *prev = vma; @@ -1102,8 +1108,10 @@ static long madvise_remove(struct vm_area_struct *vma, static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long behavior) + void *behavior_arg) { + struct madvise_behavior *arg = behavior_arg; + int behavior = arg->behavior; int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; @@ -1123,7 +1131,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: - return madvise_dontneed_free(vma, prev, start, end, behavior); + return madvise_dontneed_free(vma, prev, start, end, arg); case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: return madvise_populate(vma, prev, start, end, behavior); @@ -1308,10 +1316,10 @@ static bool process_madvise_behavior_valid(int behavior) */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long arg, + unsigned long end, void *arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long arg)) + unsigned long end, void *arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; @@ -1369,7 +1377,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long anon_name) + void *anon_name) { int error; @@ -1379,7 +1387,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, trace_android_vh_update_vma_flags(vma); error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (struct anon_vma_name *)anon_name); + anon_name); /* * madvise() returns EAGAIN if kernel resources, such as @@ -1411,7 +1419,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, + return madvise_walk_vmas(mm, start, end, anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ @@ -1537,6 +1545,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh int error; size_t len; struct blk_plug plug; + struct madvise_behavior madv_behavior = {.behavior = behavior}; if (!madvise_behavior_valid(behavior)) return -EINVAL; @@ -1569,7 +1578,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh end = start + len; blk_start_plug(&plug); - error = madvise_walk_vmas(mm, start, end, behavior, + error = madvise_walk_vmas(mm, start, end, &madv_behavior, madvise_vma_behavior); blk_finish_plug(&plug); From aeb35eb6f21607e1ce58ccbb125e7d86f7eeb78c Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sun, 8 Jun 2025 10:01:50 +1200 Subject: [PATCH 7/9] BACKPORT: mm: use per_vma lock for MADV_DONTNEED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Certain madvise operations, especially MADV_DONTNEED, occur far more frequently than other madvise options, particularly in native and Java heaps 
for dynamic memory management. Currently, the mmap_lock is always held during these operations, even when unnecessary. This causes lock contention and can lead to severe priority inversion, where low-priority threads—such as Android's HeapTaskDaemon— hold the lock and block higher-priority threads. This patch enables the use of per-VMA locks when the advised range lies entirely within a single VMA, avoiding the need for full VMA traversal. In practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs. Tangquan's testing shows that over 99.5% of memory reclaimed by Android benefits from this per-VMA lock optimization. After extended runtime, 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while only 1,231 fell back to mmap_lock. To simplify handling, the implementation falls back to the standard mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of userfaultfd_remove(). Many thanks to Lorenzo's work[1] on "mm/madvise: support VMA read locks for MADV_DONTNEED[_LOCKED]" Then use this mechanism to permit VMA locking to be done later in the madvise() logic and also to allow altering of the locking mode to permit falling back to an mmap read lock if required." One important point, as pointed out by Jann[2], is that untagged_addr_remote() requires holding mmap_lock. This is because address tagging on x86 and RISC-V is quite complex. Until untagged_addr_remote() becomes atomic—which seems unlikely in the near future—we cannot support per-VMA locks for remote processes. So for now, only local processes are supported. Link: https://lore.kernel.org/all/0b96ce61-a52c-4036-b5b6-5c50783db51f@lucifer.local/ [1] Link: https://lore.kernel.org/all/CAG48ez11zi-1jicHUZtLhyoNPGGVB+ROeAJCUw48bsjk4bbEkA@mail.gmail.com/ [2] Link: https://lkml.kernel.org/r/20250607220150.2980-1-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Lorenzo Stoakes Acked-by: Qi Zheng Cc: "Liam R. Howlett" Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Jann Horn Cc: Suren Baghdasaryan Cc: Lokesh Gidra Cc: Tangquan Zheng Signed-off-by: Andrew Morton Bug: 425827225 Change-Id: I9485baaf04a09d84e89157dab9bc9185f091947d (cherry picked from commit ff6e99ed6d49725fa50862bdedfc9468c592c9fc git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) [oven: Moved changes in madvise_do_behavior out to do_madvise and removed changes in vector_madvise. Because these functions haven't been introduced in old kernel. Resolved other minor conflict as well.] Signed-off-by: Oven --- mm/madvise.c | 184 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 141 insertions(+), 43 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 0b8897b7071a..845d9c6e63ed 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -48,36 +48,19 @@ struct madvise_walk_private { void *private; }; +enum madvise_lock_mode { + MADVISE_NO_LOCK, + MADVISE_MMAP_READ_LOCK, + MADVISE_MMAP_WRITE_LOCK, + MADVISE_VMA_READ_LOCK, +}; + struct madvise_behavior { int behavior; struct mmu_gather *tlb; + enum madvise_lock_mode lock_mode; }; -/* - * Any behaviour which results in changes to the vma->vm_flags needs to - * take mmap_lock for writing. Others, which simply traverse vmas, need - * to only take it for reading. 
- */ -static int madvise_need_mmap_write(int behavior) -{ - switch (behavior) { - case MADV_REMOVE: - case MADV_WILLNEED: - case MADV_DONTNEED: - case MADV_DONTNEED_LOCKED: - case MADV_COLD: - case MADV_PAGEOUT: - case MADV_FREE: - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - case MADV_COLLAPSE: - return 0; - default: - /* be safe, default to 1. list exceptions explicitly */ - return 1; - } -} - #ifdef CONFIG_ANON_VMA_NAME struct anon_vma_name *anon_vma_name_alloc(const char *name) { @@ -1306,6 +1289,44 @@ static bool process_madvise_behavior_valid(int behavior) } } +/* + * Try to acquire a VMA read lock if possible. + * + * We only support this lock over a single VMA, which the input range must + * span either partially or fully. + * + * This function always returns with an appropriate lock held. If a VMA read + * lock could be acquired, we return the locked VMA. + * + * If a VMA read lock could not be acquired, we return NULL and expect caller to + * fallback to mmap lock behaviour. + */ +static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm, + struct madvise_behavior *madv_behavior, + unsigned long start, unsigned long end) +{ + struct vm_area_struct *vma; + + vma = lock_vma_under_rcu(mm, start); + if (!vma) + goto take_mmap_read_lock; + /* + * Must span only a single VMA; uffd and remote processes are + * unsupported. + */ + if (end > vma->vm_end || current->mm != mm || + userfaultfd_armed(vma)) { + vma_end_read(vma); + goto take_mmap_read_lock; + } + return vma; + +take_mmap_read_lock: + mmap_read_lock(mm); + madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK; + return NULL; +} + /* * Walk the vmas in range [start,end), and call the visit function on each one. * The visit function will get start and end parameters that cover the overlap @@ -1316,7 +1337,8 @@ static bool process_madvise_behavior_valid(int behavior) */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, void *arg, + unsigned long end, struct madvise_behavior *madv_behavior, + void *arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, void *arg)) @@ -1325,6 +1347,20 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, struct vm_area_struct *prev; unsigned long tmp; int unmapped_error = 0; + int error; + + /* + * If VMA read lock is supported, apply madvise to a single VMA + * tentatively, avoiding walking VMAs. + */ + if (madv_behavior && madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) { + vma = try_vma_read_lock(mm, madv_behavior, start, end); + if (vma) { + error = visit(vma, &prev, start, end, arg); + vma_end_read(vma); + return error; + } + } /* * If the interval [start,end) covers some unmapped address @@ -1336,8 +1372,6 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, prev = vma; for (;;) { - int error; - /* Still start < end. */ if (!vma) return -ENOMEM; @@ -1419,7 +1453,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, anon_name, + return madvise_walk_vmas(mm, start, end, NULL, anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ @@ -1442,29 +1476,93 @@ static bool is_memory_failure(int behavior) } #endif -static int madvise_lock(struct mm_struct *mm, int behavior) +/* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_lock for writing. 
Others, which simply traverse vmas, need + * to only take it for reading. + */ +static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior) { - if (is_memory_failure(behavior)) - return 0; + int behavior = madv_behavior->behavior; - if (madvise_need_mmap_write(behavior)) { + if (is_memory_failure(behavior)) + return MADVISE_NO_LOCK; + + switch (behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_COLD: + case MADV_PAGEOUT: + case MADV_FREE: + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + case MADV_COLLAPSE: + return MADVISE_MMAP_READ_LOCK; + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + return MADVISE_VMA_READ_LOCK; + default: + return MADVISE_MMAP_WRITE_LOCK; + } +} + +static int madvise_lock(struct mm_struct *mm, + struct madvise_behavior *madv_behavior) +{ + enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior); + + switch (lock_mode) { + case MADVISE_NO_LOCK: + break; + case MADVISE_MMAP_WRITE_LOCK: if (mmap_write_lock_killable(mm)) return -EINTR; - } else { + break; + case MADVISE_MMAP_READ_LOCK: mmap_read_lock(mm); + break; + case MADVISE_VMA_READ_LOCK: + /* We will acquire the lock per-VMA in madvise_walk_vmas(). */ + break; } + + madv_behavior->lock_mode = lock_mode; return 0; } -static void madvise_unlock(struct mm_struct *mm, int behavior) +static void madvise_unlock(struct mm_struct *mm, + struct madvise_behavior *madv_behavior) { - if (is_memory_failure(behavior)) + switch (madv_behavior->lock_mode) { + case MADVISE_NO_LOCK: return; - - if (madvise_need_mmap_write(behavior)) + case MADVISE_MMAP_WRITE_LOCK: mmap_write_unlock(mm); - else + break; + case MADVISE_MMAP_READ_LOCK: mmap_read_unlock(mm); + break; + case MADVISE_VMA_READ_LOCK: + /* We will drop the lock per-VMA in madvise_walk_vmas(). */ + break; + } + + madv_behavior->lock_mode = MADVISE_NO_LOCK; +} + +/* + * untagged_addr_remote() assumes mmap_lock is already held. On + * architectures like x86 and RISC-V, tagging is tricky because each + * mm may have a different tagging mask. However, we might only hold + * the per-VMA lock (currently only local processes are supported), + * so untagged_addr is used to avoid the mmap_lock assertion for + * local processes. + */ +static inline unsigned long get_untagged_addr(struct mm_struct *mm, + unsigned long start) +{ + return current->mm == mm ? untagged_addr(start) : + untagged_addr_remote(mm, start); } /* @@ -1565,7 +1663,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh if (end == start) return 0; - error = madvise_lock(mm, behavior); + error = madvise_lock(mm, &madv_behavior); if (error) return error; @@ -1574,15 +1672,15 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh return madvise_inject_error(behavior, start, start + len_in); #endif - start = untagged_addr_remote(mm, start); + start = get_untagged_addr(mm, start); end = start + len; blk_start_plug(&plug); error = madvise_walk_vmas(mm, start, end, &madv_behavior, - madvise_vma_behavior); + &madv_behavior, madvise_vma_behavior); blk_finish_plug(&plug); - madvise_unlock(mm, behavior); + madvise_unlock(mm, &madv_behavior); return error; } From 75a0fcbfdf35e712350c1c411c0735af125ab124 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Jun 2025 20:10:38 +0000 Subject: [PATCH 8/9] ANDROID: look up vma under RCU in linker_ctx() madvise_dontneed_single_vma() calls linker_ctx() to detect whether the madvise was initiated by the dynamic linker. 
This function requires mmap_lock in order to lookup the vma, however with recent changes we do not hold mmap_lock while executing MADV_DONTNEED. Lookup the vma under RCU instead to avoid lockdep warning. Bug: 425827225 Change-Id: Ie5e0243f359b96292d4f32ee3299050b871dc6c5 Signed-off-by: Suren Baghdasaryan --- mm/pgsize_migration.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c index f72240b0de79..07dd1f3fa7d8 100644 --- a/mm/pgsize_migration.c +++ b/mm/pgsize_migration.c @@ -167,8 +167,6 @@ static __always_inline bool str_has_suffix(const char *str, const char *suffix) * VMAs of the current task. * * Returns true if in linker context, otherwise false. - * - * Caller must hold mmap lock in read mode. */ static inline bool linker_ctx(void) { @@ -180,14 +178,14 @@ static inline bool linker_ctx(void) if (!regs) return false; - vma = find_vma(mm, instruction_pointer(regs)); + vma = lock_vma_under_rcu(mm, instruction_pointer(regs)); /* Current execution context, the VMA must be present */ BUG_ON(!vma); file = vma->vm_file; if (!file) - return false; + goto out; if ((vma->vm_flags & VM_EXEC)) { char buf[64]; @@ -205,10 +203,13 @@ static inline bool linker_ctx(void) * * Check the base name (linker64). */ - if (!strcmp(kbasename(path), "linker64")) + if (!strcmp(kbasename(path), "linker64")) { + vma_end_read(vma); return true; + } } - +out: + vma_end_read(vma); return false; } From 464ddce4070dfa9633c25d958adffaa3f3ef5a75 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 26 Jun 2025 13:25:58 -0700 Subject: [PATCH 9/9] ANDROID: mthp: Prevent TAO non-movable allocations from movable zone aosp/I30ac33034f0ff697a4330ef752babf94d4e234f5 removed the ability to allocate non-movable __GPF_COMP allocations from TAO zones as they could fallback to the movable-zone; causing various issues. This was reintroduced by aosp/I2fdfc4df8b03daa96fd6c2c8c6630d26a8509ad0 iff the movable zone is not enabled; adding additional checks to verify this. However the case of movable_node command line parameter was missed; when this is set memory blocks can be onlined into the movable-zone. Add a check for !movable_node_is_enabled() to avoid such __GPF_COMP allocations from TAO (virtual zones) if it's possible that memory blocks can come online to the movable-zone. Bug: 427924381 Bug: 313807618 Bug: 353906885 Change-Id: If2ebd4d7f4badd99599b01939cd89d0cba9b3fb5 Signed-off-by: Kalesh Singh --- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 43dc4463388b..808caa5c170c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -139,9 +139,9 @@ static inline enum zone_type __gfp_zone(gfp_t flags) if (z == ZONE_MOVABLE) return LAST_VIRT_ZONE; - /* Allow dma-buf etc to use virtual zones */ + /* Allow dma-buf etc to use virtual zones, if there is no movable zone */ if ((flags & __GFP_COMP) && (flags & __GFP_HIGHMEM) && - !static_branch_unlikely(&movablecore_enabled)) + !static_branch_unlikely(&movablecore_enabled) && !movable_node_is_enabled()) return LAST_VIRT_ZONE; return z;