diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg
index 0de7176f5e86..d8ed1e76d857 100644
--- a/android/abi_gki_aarch64.stg
+++ b/android/abi_gki_aarch64.stg
@@ -359430,6 +359430,24 @@ elf_symbol {
   type_id: 0x9bc8472e
   full_name: "__traceiter_android_rvh_alloc_and_link_pwqs"
 }
+elf_symbol {
+  id: 0xc0fd1a1f
+  name: "__traceiter_android_rvh_alloc_pages_adjust_wmark"
+  is_defined: true
+  symbol_type: FUNCTION
+  crc: 0x6a18478a
+  type_id: 0x9870a448
+  full_name: "__traceiter_android_rvh_alloc_pages_adjust_wmark"
+}
+elf_symbol {
+  id: 0x6eed3175
+  name: "__traceiter_android_rvh_alloc_pages_reset_wmark"
+  is_defined: true
+  symbol_type: FUNCTION
+  crc: 0x414d4c97
+  type_id: 0x9870a59a
+  full_name: "__traceiter_android_rvh_alloc_pages_reset_wmark"
+}
 elf_symbol {
   id: 0xef79dd4d
   name: "__traceiter_android_rvh_alloc_workqueue"
@@ -366522,6 +366540,24 @@ elf_symbol {
   type_id: 0x18ccbd2c
   full_name: "__tracepoint_android_rvh_alloc_and_link_pwqs"
 }
+elf_symbol {
+  id: 0x89ff3495
+  name: "__tracepoint_android_rvh_alloc_pages_adjust_wmark"
+  is_defined: true
+  symbol_type: OBJECT
+  crc: 0xc63d662f
+  type_id: 0x18ccbd2c
+  full_name: "__tracepoint_android_rvh_alloc_pages_adjust_wmark"
+}
+elf_symbol {
+  id: 0xab6e1e0f
+  name: "__tracepoint_android_rvh_alloc_pages_reset_wmark"
+  is_defined: true
+  symbol_type: OBJECT
+  crc: 0xdbce1a35
+  type_id: 0x18ccbd2c
+  full_name: "__tracepoint_android_rvh_alloc_pages_reset_wmark"
+}
 elf_symbol {
   id: 0x0b219d2b
   name: "__tracepoint_android_rvh_alloc_workqueue"
@@ -436771,6 +436807,8 @@ interface {
   symbol_id: 0xb42422d5
   symbol_id: 0xb3d70eab
   symbol_id: 0x9ca1a40f
+  symbol_id: 0xc0fd1a1f
+  symbol_id: 0x6eed3175
   symbol_id: 0xef79dd4d
   symbol_id: 0x0b48afa1
   symbol_id: 0xa927338c
@@ -437559,6 +437597,8 @@ interface {
   symbol_id: 0x4b7a8fd7
   symbol_id: 0xcd36f539
   symbol_id: 0x33f0c37d
+  symbol_id: 0x89ff3495
+  symbol_id: 0xab6e1e0f
   symbol_id: 0x0b219d2b
   symbol_id: 0x748c1fd7
   symbol_id: 0xcb42202e
diff --git a/android/abi_gki_aarch64_vivo b/android/abi_gki_aarch64_vivo
index b8f2d60402fd..093e3588d263 100644
--- a/android/abi_gki_aarch64_vivo
+++ b/android/abi_gki_aarch64_vivo
@@ -108,9 +108,11 @@
   __traceiter_android_vh_account_process_tick_gran
   __traceiter_android_vh_adjust_kvmalloc_flags
   __traceiter_android_vh_alloc_pages_adjust_wmark
+  __traceiter_android_rvh_alloc_pages_adjust_wmark
   __traceiter_android_vh_alloc_pages_failure_bypass
   __traceiter_android_vh_alloc_pages_reclaim_bypass
   __traceiter_android_vh_alloc_pages_reset_wmark
+  __traceiter_android_rvh_alloc_pages_reset_wmark
   __traceiter_android_vh_alter_mutex_list_add
   __traceiter_android_vh_alter_rwsem_list_add
   __traceiter_android_vh_bd_link_disk_holder
@@ -241,9 +243,11 @@
   __tracepoint_android_vh_account_process_tick_gran
   __tracepoint_android_vh_adjust_kvmalloc_flags
   __tracepoint_android_vh_alloc_pages_adjust_wmark
+  __tracepoint_android_rvh_alloc_pages_adjust_wmark
   __tracepoint_android_vh_alloc_pages_failure_bypass
   __tracepoint_android_vh_alloc_pages_reclaim_bypass
   __tracepoint_android_vh_alloc_pages_reset_wmark
+  __tracepoint_android_rvh_alloc_pages_reset_wmark
   __tracepoint_android_vh_alter_mutex_list_add
   __tracepoint_android_vh_alter_rwsem_list_add
   __tracepoint_android_vh_bd_link_disk_holder
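The new __traceiter/__tracepoint symbols above are the ABI surface of the two restricted vendor hooks declared in include/trace/hooks/mm.h below. A minimal sketch of how a GKI vendor module would attach one of them, assuming the usual register_trace_android_rvh_*() helper generated by DECLARE_RESTRICTED_HOOK (the handler name and its policy are illustrative, not part of this patch):

	#include <linux/module.h>
	#include <trace/hooks/mm.h>

	/* The tracepoint machinery prepends a void *data argument to TP_PROTO. */
	static void vendor_adjust_wmark(void *data, gfp_t gfp_mask, int order,
					int *alloc_flags)
	{
		/* A real policy would adjust *alloc_flags here. */
	}

	static int __init vendor_wmark_init(void)
	{
		/* Restricted hooks register once and cannot be unregistered. */
		return register_trace_android_rvh_alloc_pages_adjust_wmark(
				vendor_adjust_wmark, NULL);
	}
	module_init(vendor_wmark_init);
	MODULE_LICENSE("GPL");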
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 4523cc6f2725..61b524129366 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -65,6 +65,7 @@ static void __init sort_memblock_regions(void)
 static int __init register_memblock_regions(void)
 {
 	struct memblock_region *reg;
+	bool pvmfw_in_mem = false;
 
 	for_each_mem_region(reg) {
 		if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS)
@@ -72,6 +73,27 @@ static int __init register_memblock_regions(void)
 
 		hyp_memory[*hyp_memblock_nr_ptr] = *reg;
 		(*hyp_memblock_nr_ptr)++;
+
+		if (!*pvmfw_size || pvmfw_in_mem ||
+		    !memblock_addrs_overlap(reg->base, reg->size, *pvmfw_base, *pvmfw_size))
+			continue;
+		/* If the pvmfw region overlaps a memblock, it must be a subset */
+		if (*pvmfw_base < reg->base ||
+		    (*pvmfw_base + *pvmfw_size) > (reg->base + reg->size))
+			return -EINVAL;
+		pvmfw_in_mem = true;
 	}
+
+	if (*pvmfw_size && !pvmfw_in_mem) {
+		if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS)
+			return -ENOMEM;
+
+		hyp_memory[*hyp_memblock_nr_ptr] = (struct memblock_region) {
+			.base = *pvmfw_base,
+			.size = *pvmfw_size,
+			.flags = MEMBLOCK_NOMAP,
+		};
+		(*hyp_memblock_nr_ptr)++;
+	}
 
 	sort_memblock_regions();
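The overlap test used above, memblock_addrs_overlap(), treats regions as half-open intervals; the hunk then requires that any overlap be full containment, and appends a NOMAP region only when the firmware lies outside every memblock. A standalone sketch of the two predicates involved (function names are illustrative):

	#include <stdbool.h>
	#include <stdint.h>

	/* Half-open interval overlap, as memblock_addrs_overlap() computes it. */
	static bool regions_overlap(uint64_t b1, uint64_t s1, uint64_t b2, uint64_t s2)
	{
		return b1 < b2 + s2 && b2 < b1 + s1;
	}

	/* The containment the hunk enforces: pvmfw must sit wholly inside reg. */
	static bool region_is_subset(uint64_t inner_base, uint64_t inner_size,
				     uint64_t outer_base, uint64_t outer_size)
	{
		return inner_base >= outer_base &&
		       inner_base + inner_size <= outer_base + outer_size;
	}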
diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c
index eaea41831d1f..d3f7ff4fde56 100644
--- a/drivers/android/vendor_hooks.c
+++ b/drivers/android/vendor_hooks.c
@@ -463,6 +463,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reclaim_bypass);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_failure_bypass);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_adjust_wmark);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reset_wmark);
+EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_alloc_pages_adjust_wmark);
+EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_alloc_pages_reset_wmark);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_watermark_fast_ok);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_fiq_dump);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_swapmem_gather_init);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 43dc4463388b..808caa5c170c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -139,9 +139,9 @@ static inline enum zone_type __gfp_zone(gfp_t flags)
 	if (z == ZONE_MOVABLE)
 		return LAST_VIRT_ZONE;
 
-	/* Allow dma-buf etc to use virtual zones */
+	/* Allow dma-buf etc. to use virtual zones if there is no movable zone */
 	if ((flags & __GFP_COMP) && (flags & __GFP_HIGHMEM) &&
-	    !static_branch_unlikely(&movablecore_enabled))
+	    !static_branch_unlikely(&movablecore_enabled) && !movable_node_is_enabled())
 		return LAST_VIRT_ZONE;
 
 	return z;
diff --git a/include/trace/hooks/mm.h b/include/trace/hooks/mm.h
index 00df4c5ea263..65eb40c00944 100644
--- a/include/trace/hooks/mm.h
+++ b/include/trace/hooks/mm.h
@@ -156,6 +156,15 @@ DECLARE_HOOK(android_vh_alloc_pages_reset_wmark,
 		unsigned long direct_reclaim_retries),
 	TP_ARGS(gfp_mask, order, alloc_flags, did_some_progress,
 		no_progress_loops, direct_reclaim_retries));
+DECLARE_RESTRICTED_HOOK(android_rvh_alloc_pages_adjust_wmark,
+	TP_PROTO(gfp_t gfp_mask, int order, int *alloc_flags),
+	TP_ARGS(gfp_mask, order, alloc_flags), 3);
+DECLARE_RESTRICTED_HOOK(android_rvh_alloc_pages_reset_wmark,
+	TP_PROTO(gfp_t gfp_mask, int order, int *alloc_flags,
+		unsigned long *did_some_progress, int *no_progress_loops,
+		unsigned long direct_reclaim_retries),
+	TP_ARGS(gfp_mask, order, alloc_flags, did_some_progress,
+		no_progress_loops, direct_reclaim_retries), 6);
 DECLARE_HOOK(android_vh_unreserve_highatomic_bypass,
 	TP_PROTO(bool force, struct zone *zone, bool *skip_unreserve_highatomic),
 	TP_ARGS(force, zone, skip_unreserve_highatomic));
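The trailing integer in DECLARE_RESTRICTED_HOOK is the argument count of the prototype. A handler for the six-argument reset_wmark hook would carry the matching signature, with void *data prepended by the tracepoint machinery; the probe name and the retry policy below are purely illustrative:

	static void probe_reset_wmark(void *data, gfp_t gfp_mask, int order,
			int *alloc_flags, unsigned long *did_some_progress,
			int *no_progress_loops, unsigned long direct_reclaim_retries)
	{
		/*
		 * Illustrative policy: restart the no-progress accounting for
		 * the first few direct-reclaim retries so the allocator keeps
		 * retrying with the adjusted watermark.
		 */
		if (direct_reclaim_retries && direct_reclaim_retries < 3)
			*no_progress_loops = 0;
	}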
diff --git a/mm/madvise.c b/mm/madvise.c
index b36a6a32a1e3..845d9c6e63ed 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -48,30 +48,18 @@ struct madvise_walk_private {
 	void *private;
 };
 
-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
-	switch (behavior) {
-	case MADV_REMOVE:
-	case MADV_WILLNEED:
-	case MADV_DONTNEED:
-	case MADV_DONTNEED_LOCKED:
-	case MADV_COLD:
-	case MADV_PAGEOUT:
-	case MADV_FREE:
-	case MADV_POPULATE_READ:
-	case MADV_POPULATE_WRITE:
-	case MADV_COLLAPSE:
-		return 0;
-	default:
-		/* be safe, default to 1. list exceptions explicitly */
-		return 1;
-	}
-}
+enum madvise_lock_mode {
+	MADVISE_NO_LOCK,
+	MADVISE_MMAP_READ_LOCK,
+	MADVISE_MMAP_WRITE_LOCK,
+	MADVISE_VMA_READ_LOCK,
+};
+
+struct madvise_behavior {
+	int behavior;
+	struct mmu_gather *tlb;
+	enum madvise_lock_mode lock_mode;
+};
 
 #ifdef CONFIG_ANON_VMA_NAME
 struct anon_vma_name *anon_vma_name_alloc(const char *name)
@@ -941,8 +929,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
 static long madvise_dontneed_free(struct vm_area_struct *vma,
 				  struct vm_area_struct **prev,
 				  unsigned long start, unsigned long end,
-				  int behavior)
+				  struct madvise_behavior *madv_behavior)
 {
+	int behavior = madv_behavior->behavior;
 	struct mm_struct *mm = vma->vm_mm;
 
 	*prev = vma;
@@ -1102,8 +1091,10 @@ static long madvise_remove(struct vm_area_struct *vma,
 static int madvise_vma_behavior(struct vm_area_struct *vma,
 				struct vm_area_struct **prev,
 				unsigned long start, unsigned long end,
-				unsigned long behavior)
+				void *behavior_arg)
 {
+	struct madvise_behavior *arg = behavior_arg;
+	int behavior = arg->behavior;
 	int error;
 	struct anon_vma_name *anon_name;
 	unsigned long new_flags = vma->vm_flags;
@@ -1123,7 +1114,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 	case MADV_FREE:
 	case MADV_DONTNEED:
 	case MADV_DONTNEED_LOCKED:
-		return madvise_dontneed_free(vma, prev, start, end, behavior);
+		return madvise_dontneed_free(vma, prev, start, end, arg);
 	case MADV_POPULATE_READ:
 	case MADV_POPULATE_WRITE:
 		return madvise_populate(vma, prev, start, end, behavior);
@@ -1298,6 +1289,44 @@ static bool process_madvise_behavior_valid(int behavior)
 	}
 }
 
+/*
+ * Try to acquire a VMA read lock if possible.
+ *
+ * We only support this lock over a single VMA, which the input range must
+ * span either partially or fully.
+ *
+ * This function always returns with an appropriate lock held. If a VMA read
+ * lock could be acquired, we return the locked VMA.
+ *
+ * If a VMA read lock could not be acquired, we return NULL and expect the
+ * caller to fall back to mmap lock behaviour.
+ */
+static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior,
+		unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *vma;
+
+	vma = lock_vma_under_rcu(mm, start);
+	if (!vma)
+		goto take_mmap_read_lock;
+	/*
+	 * Must span only a single VMA; uffd and remote processes are
+	 * unsupported.
+	 */
+	if (end > vma->vm_end || current->mm != mm ||
+	    userfaultfd_armed(vma)) {
+		vma_end_read(vma);
+		goto take_mmap_read_lock;
+	}
+	return vma;
+
+take_mmap_read_lock:
+	mmap_read_lock(mm);
+	madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
+	return NULL;
+}
+
 /*
  * Walk the vmas in range [start,end), and call the visit function on each one.
  * The visit function will get start and end parameters that cover the overlap
@@ -1308,15 +1337,30 @@ static bool process_madvise_behavior_valid(int behavior)
  */
 static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
-		unsigned long end, unsigned long arg,
+		unsigned long end, struct madvise_behavior *madv_behavior,
+		void *arg,
 		int (*visit)(struct vm_area_struct *vma,
 			     struct vm_area_struct **prev, unsigned long start,
-			     unsigned long end, unsigned long arg))
+			     unsigned long end, void *arg))
 {
 	struct vm_area_struct *vma;
 	struct vm_area_struct *prev;
 	unsigned long tmp;
 	int unmapped_error = 0;
+	int error;
+
+	/*
+	 * If VMA read lock is supported, apply madvise to a single VMA
+	 * tentatively, avoiding walking VMAs.
+	 */
+	if (madv_behavior && madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
+		vma = try_vma_read_lock(mm, madv_behavior, start, end);
+		if (vma) {
+			error = visit(vma, &prev, start, end, arg);
+			vma_end_read(vma);
+			return error;
+		}
+	}
 
 	/*
 	 * If the interval [start,end) covers some unmapped address
@@ -1328,8 +1372,6 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 	prev = vma;
 
 	for (;;) {
-		int error;
-
 		/* Still start < end. */
 		if (!vma)
 			return -ENOMEM;
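The net effect for the common MADV_DONTNEED case is visible from userspace: a range inside a single VMA now takes only that VMA's read lock instead of the process-wide mmap_lock. An illustrative (hypothetical) test program exercising the new path:

	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 1 << 20;
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		memset(p, 1, len);			/* fault the pages in */
		if (madvise(p, len, MADV_DONTNEED))	/* per-VMA read-lock path */
			return 1;
		return p[0] != 0;			/* anon pages read back as zero */
	}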
@@ -1369,7 +1411,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 static int madvise_vma_anon_name(struct vm_area_struct *vma,
 				 struct vm_area_struct **prev,
 				 unsigned long start, unsigned long end,
-				 unsigned long anon_name)
+				 void *anon_name)
 {
 	int error;
 
@@ -1379,7 +1421,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
 	trace_android_vh_update_vma_flags(vma);
 	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
-				   (struct anon_vma_name *)anon_name);
+				   anon_name);
 
 	/*
 	 * madvise() returns EAGAIN if kernel resources, such as
@@ -1411,10 +1453,118 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 	if (end == start)
 		return 0;
 
-	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
+	return madvise_walk_vmas(mm, start, end, NULL, anon_name,
 				 madvise_vma_anon_name);
 }
 #endif /* CONFIG_ANON_VMA_NAME */
+
+#ifdef CONFIG_MEMORY_FAILURE
+static bool is_memory_failure(int behavior)
+{
+	switch (behavior) {
+	case MADV_HWPOISON:
+	case MADV_SOFT_OFFLINE:
+		return true;
+	default:
+		return false;
+	}
+}
+#else
+static bool is_memory_failure(int behavior)
+{
+	return false;
+}
+#endif
+
+/*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
+static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
+{
+	int behavior = madv_behavior->behavior;
+
+	if (is_memory_failure(behavior))
+		return MADVISE_NO_LOCK;
+
+	switch (behavior) {
+	case MADV_REMOVE:
+	case MADV_WILLNEED:
+	case MADV_COLD:
+	case MADV_PAGEOUT:
+	case MADV_FREE:
+	case MADV_POPULATE_READ:
+	case MADV_POPULATE_WRITE:
+	case MADV_COLLAPSE:
+		return MADVISE_MMAP_READ_LOCK;
+	case MADV_DONTNEED:
+	case MADV_DONTNEED_LOCKED:
+		return MADVISE_VMA_READ_LOCK;
+	default:
+		return MADVISE_MMAP_WRITE_LOCK;
+	}
+}
+
+static int madvise_lock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior)
+{
+	enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
+
+	switch (lock_mode) {
+	case MADVISE_NO_LOCK:
+		break;
+	case MADVISE_MMAP_WRITE_LOCK:
+		if (mmap_write_lock_killable(mm))
+			return -EINTR;
+		break;
+	case MADVISE_MMAP_READ_LOCK:
+		mmap_read_lock(mm);
+		break;
+	case MADVISE_VMA_READ_LOCK:
+		/* We will acquire the lock per-VMA in madvise_walk_vmas(). */
+		break;
+	}
+
+	madv_behavior->lock_mode = lock_mode;
+	return 0;
+}
+
+static void madvise_unlock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior)
+{
+	switch (madv_behavior->lock_mode) {
+	case MADVISE_NO_LOCK:
+		return;
+	case MADVISE_MMAP_WRITE_LOCK:
+		mmap_write_unlock(mm);
+		break;
+	case MADVISE_MMAP_READ_LOCK:
+		mmap_read_unlock(mm);
+		break;
+	case MADVISE_VMA_READ_LOCK:
+		/* We will drop the lock per-VMA in madvise_walk_vmas(). */
+		break;
+	}
+
+	madv_behavior->lock_mode = MADVISE_NO_LOCK;
+}
+
+/*
+ * untagged_addr_remote() assumes mmap_lock is already held. On
+ * architectures like x86 and RISC-V, tagging is tricky because each
+ * mm may have a different tagging mask. However, we might only hold
+ * the per-VMA lock (currently only local processes are supported),
+ * so untagged_addr is used to avoid the mmap_lock assertion for
+ * local processes.
+ */
+static inline unsigned long get_untagged_addr(struct mm_struct *mm,
+		unsigned long start)
+{
+	return current->mm == mm ? untagged_addr(start) :
+				   untagged_addr_remote(mm, start);
+}
 
 /*
  * The madvise(2) system call.
 *
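Taken together, the new helpers give do_madvise() a uniform lock lifecycle. A condensed sketch of the intended calling sequence, mirroring the do_madvise() hunk below (the behavior value is just an example):

	struct madvise_behavior madv_behavior = { .behavior = MADV_DONTNEED };
	int error = madvise_lock(mm, &madv_behavior);	/* picks MADVISE_VMA_READ_LOCK */

	if (error)
		return error;
	/* ... madvise_walk_vmas() takes and drops the per-VMA lock itself ... */
	madvise_unlock(mm, &madv_behavior);	/* drops nothing in per-VMA mode */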
@@ -1491,9 +1641,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 {
 	unsigned long end;
 	int error;
-	int write;
 	size_t len;
 	struct blk_plug plug;
+	struct madvise_behavior madv_behavior = {.behavior = behavior};
 
 	if (!madvise_behavior_valid(behavior))
 		return -EINVAL;
@@ -1513,31 +1663,24 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 	if (end == start)
 		return 0;
 
+	error = madvise_lock(mm, &madv_behavior);
+	if (error)
+		return error;
+
 #ifdef CONFIG_MEMORY_FAILURE
 	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
 		return madvise_inject_error(behavior, start, start + len_in);
 #endif
 
-	write = madvise_need_mmap_write(behavior);
-	if (write) {
-		if (mmap_write_lock_killable(mm))
-			return -EINTR;
-	} else {
-		mmap_read_lock(mm);
-	}
-
-	start = untagged_addr_remote(mm, start);
+	start = get_untagged_addr(mm, start);
 	end = start + len;
 
 	blk_start_plug(&plug);
-	error = madvise_walk_vmas(mm, start, end, behavior,
-			madvise_vma_behavior);
+	error = madvise_walk_vmas(mm, start, end, &madv_behavior,
+			&madv_behavior, madvise_vma_behavior);
 	blk_finish_plug(&plug);
-	if (write)
-		mmap_write_unlock(mm);
-	else
-		mmap_read_unlock(mm);
+
+	madvise_unlock(mm, &madv_behavior);
 
 	return error;
 }
diff --git a/mm/memory.c b/mm/memory.c
index dfbd0a2795db..a04841dc9291 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3962,7 +3962,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
 	struct folio *folio;
 	swp_entry_t entry;
 
-	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
+	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE|__GFP_CMA, 0, vma,
 				vmf->address, false);
 	if (!folio)
 		return NULL;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b42afcd0d3c3..152b0424fcbf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4438,8 +4438,15 @@ restart:
 	if (alloc_flags & ALLOC_KSWAPD)
 		wake_all_kswapds(order, gfp_mask, ac);
 
-	if (can_direct_reclaim && !direct_reclaim_retries && !(current->flags & PF_MEMALLOC))
+	if (can_direct_reclaim && !direct_reclaim_retries && !(current->flags & PF_MEMALLOC)) {
+		/*
+		 * trace_android_vh_alloc_pages_adjust_wmark() is deprecated
+		 * because it cannot be used from a CPU-offline or non-atomic
+		 * context; use trace_android_rvh_alloc_pages_adjust_wmark()
+		 * instead.
+		 */
 		trace_android_vh_alloc_pages_adjust_wmark(gfp_mask, order, &alloc_flags);
+		trace_android_rvh_alloc_pages_adjust_wmark(gfp_mask, order, &alloc_flags);
+	}
 
 	/*
	 * The adjusted alloc_flags might result in immediate success, so try
@@ -4587,8 +4594,15 @@ retry:
 			!(gfp_mask & __GFP_RETRY_MAYFAIL)))
 		goto nopage;
 
+	/*
+	 * trace_android_vh_alloc_pages_reset_wmark() is deprecated because
+	 * it cannot be used from a CPU-offline or non-atomic context; use
+	 * trace_android_rvh_alloc_pages_reset_wmark() instead.
+	 */
 	trace_android_vh_alloc_pages_reset_wmark(gfp_mask, order,
 			&alloc_flags, &did_some_progress, &no_progress_loops, direct_reclaim_retries);
+	trace_android_rvh_alloc_pages_reset_wmark(gfp_mask, order,
+			&alloc_flags, &did_some_progress, &no_progress_loops, direct_reclaim_retries);
 
 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
 				 did_some_progress > 0, &no_progress_loops))
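For vendor modules moving off the deprecated hooks, the switch is mechanical because the probe signature is identical; only the registration call changes. A hedged sketch (vendor_probe is a hypothetical handler), with the caveat that restricted hooks have no unregister call, so the module must stay loaded for the life of the system:

	/* Before (deprecated hook; handlers could be unregistered): */
	register_trace_android_vh_alloc_pages_adjust_wmark(vendor_probe, NULL);

	/* After (restricted hook; registration is permanent): */
	register_trace_android_rvh_alloc_pages_adjust_wmark(vendor_probe, NULL);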
diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c
index f72240b0de79..07dd1f3fa7d8 100644
--- a/mm/pgsize_migration.c
+++ b/mm/pgsize_migration.c
@@ -167,8 +167,6 @@ static __always_inline bool str_has_suffix(const char *str, const char *suffix)
  * VMAs of the current task.
  *
  * Returns true if in linker context, otherwise false.
- *
- * Caller must hold mmap lock in read mode.
  */
 static inline bool linker_ctx(void)
 {
@@ -180,14 +178,14 @@ static inline bool linker_ctx(void)
 	if (!regs)
 		return false;
 
-	vma = find_vma(mm, instruction_pointer(regs));
+	vma = lock_vma_under_rcu(mm, instruction_pointer(regs));
 
 	/* Current execution context, the VMA must be present */
 	BUG_ON(!vma);
 
 	file = vma->vm_file;
 	if (!file)
-		return false;
+		goto out;
 
 	if ((vma->vm_flags & VM_EXEC)) {
 		char buf[64];
@@ -205,10 +203,13 @@ static inline bool linker_ctx(void)
 		 *
 		 * Check the base name (linker64).
 		 */
-		if (!strcmp(kbasename(path), "linker64"))
+		if (!strcmp(kbasename(path), "linker64")) {
+			vma_end_read(vma);
 			return true;
+		}
 	}
-
+out:
+	vma_end_read(vma);
 	return false;
 }
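The linker_ctx() rework follows the general contract of lock_vma_under_rcu(): every path out of the locked region must pair with vma_end_read(), which is why the early return for a file-less VMA now jumps to the common exit label. A distilled sketch of the pattern (do_check() is a hypothetical helper):

	static bool check_under_vma_lock(struct mm_struct *mm, unsigned long addr)
	{
		struct vm_area_struct *vma = lock_vma_under_rcu(mm, addr);
		bool ret;

		if (!vma)
			return false;	/* nothing locked, nothing to drop */
		ret = do_check(vma);	/* hypothetical helper */
		vma_end_read(vma);	/* balanced on every exit path */
		return ret;
	}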