From 066872d13d0c0b076785f0b794b650de0941c1c9 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 9 Feb 2024 01:36:02 -0700 Subject: [PATCH] FROMLIST: BACKPORT: THP shattering: the reverse of collapsing In contrast to split, shatter migrates occupied pages in a partially mapped THP to a bunch of base folios. IOW, unlike split done in place, shatter is the exact opposite of collapse. The advantage of shattering is that it keeps the original THP intact. The cost of copying during the migration is not a side effect, but rather by design, since splitting is considered a discouraged behavior. In retail terms, the return of a purchase is charged with a restocking fee and the original goods can be resold. THPs from ZONE_NOMERGE can only be shattered, since they cannot be split or merged. THPs from ZONE_NOSPLIT can be shattered or split (the latter requires [1]), if they are above the minimum order. [1] https://lore.kernel.org/20240226205534.1603748-1-zi.yan@sent.com/ Change-Id: I7637124bb1ede775dba7b1d363d53256f337851f Signed-off-by: Yu Zhao Link: https://lore.kernel.org/r/20240229183436.4110845-3-yuzhao@google.com/ Bug: 313807618 Signed-off-by: Kalesh Singh --- include/linux/memcontrol.h | 5 + include/linux/mm_inline.h | 24 +++ include/linux/mm_types.h | 8 +- include/linux/vm_event_item.h | 3 + mm/huge_memory.c | 278 ++++++++++++++++++++++++++++------ mm/internal.h | 39 +++++ mm/madvise.c | 6 +- mm/memcontrol.c | 47 ++++++ mm/memory-failure.c | 2 +- mm/migrate.c | 70 ++++++--- mm/page_alloc.c | 4 + mm/rmap.c | 4 + mm/shmem.c | 4 +- mm/truncate.c | 6 +- mm/vmscan.c | 9 ++ mm/vmstat.c | 3 + 16 files changed, 439 insertions(+), 73 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0e35b72e7cb1..dfabbcb8e7ac 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1155,6 +1155,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, } void split_page_memcg(struct page *head, unsigned int nr); +void folio_copy_memcg(struct folio *folio); unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, @@ -1584,6 +1585,10 @@ static inline void split_page_memcg(struct page *head, unsigned int nr) { } +static inline void folio_copy_memcg(struct folio *folio) +{ +} + static inline unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index daeef147501b..66af3d07d94a 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -220,6 +220,25 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } +static inline bool lru_gen_add_dst(struct lruvec *lruvec, struct folio *dst) +{ + int gen = folio_lru_gen(dst); + int type = folio_is_file_lru(dst); + int zone = folio_zonenum(dst); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + + if (gen < 0) + return false; + + lockdep_assert_held(&lruvec->lru_lock); + VM_WARN_ON_ONCE_FOLIO(folio_lruvec(dst) != lruvec, dst); + + list_add_tail(&dst->lru, &lrugen->folios[gen][type][zone]); + lru_gen_update_size(lruvec, dst, -1, gen); + + return true; +} + static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; @@ -305,6 +324,11 @@ static inline bool lru_gen_in_fault(void) return false; } +static inline bool lru_gen_add_dst(struct lruvec *lruvec, struct folio *dst) +{ + return false; +} + 
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36765081b8c1..937ef6c33501 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -352,14 +352,19 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; - unsigned long _folio_avail; /* public: */ atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; atomic_t _pincount; #ifdef CONFIG_64BIT + unsigned int __padding; unsigned int _folio_nr_pages; #endif + union { + unsigned long _private_1; + unsigned long *_dst_ul; + struct page **_dst_pp; + }; /* private: the union with struct page is transitional */ }; struct page __page_1; @@ -405,6 +410,7 @@ FOLIO_MATCH(memcg_data, memcg_data); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); +FOLIO_MATCH(private, _private_1); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 86a33075dfd5..faa993839c22 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -108,6 +108,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, THP_SPLIT_PMD, + THP_SHATTER_PAGE, + THP_SHATTER_PAGE_FAILED, + THP_SHATTER_PAGE_DISCARDED, THP_SCAN_EXCEED_NONE_PTE, THP_SCAN_EXCEED_SWAP_PTE, THP_SCAN_EXCEED_SHARED_PTE, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dffa327edbcd..0596313f4e5e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2629,6 +2629,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); + if (vma->vm_flags & VM_LOCKED) + set_src_usage(page + i, SRC_PAGE_MLOCKED); + else + set_src_usage(page + i, SRC_PAGE_MAPPED); VM_WARN_ON(!pte_none(ptep_get(pte + i))); set_pte_at(mm, addr, pte + i, entry); @@ -2780,6 +2784,156 @@ static void remap_page(struct folio *folio, unsigned long nr) } } +static int prep_to_unmap(struct folio *src) +{ + int nr_pages = folio_nr_pages(src); + + if (folio_can_split(src)) + return 0; + + WARN_ON_ONCE(src->_dst_pp); + + src->_dst_pp = kcalloc(nr_pages, sizeof(struct page *), GFP_ATOMIC); + + return src->_dst_pp ? 
0 : -ENOMEM; +} + +static bool try_to_discard(struct folio *src, int i) +{ + int usage; + void *addr; + struct page *page = folio_page(src, i); + + if (!folio_test_anon(src)) + return false; + + if (folio_test_swapcache(src)) + return false; + + usage = src_page_usage(page); + if (usage & SRC_PAGE_MLOCKED) + return false; + + if (!(usage & SRC_PAGE_MAPPED)) + return true; + + addr = kmap_local_page(page); + if (!memchr_inv(addr, 0, PAGE_SIZE)) + set_src_usage(page, SRC_PAGE_CLEAN); + kunmap_local(addr); + + return can_discard_src(page); +} + +static int prep_dst_pages(struct folio *src) +{ + int i; + int nr_pages = folio_nr_pages(src); + + if (folio_can_split(src)) + return 0; + + if (WARN_ON_ONCE(!src->_dst_pp)) + return -ENOMEM; + + for (i = 0; i < nr_pages; i++) { + struct page *dst = NULL; + + if (try_to_discard(src, i)) { + count_vm_event(THP_SHATTER_PAGE_DISCARDED); + continue; + } + + do { + int nid = folio_nid(src); + gfp_t gfp = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | + GFP_NOWAIT | __GFP_THISNODE; + + if (dst) + __free_page(dst); + + dst = alloc_pages_node(nid, gfp, 0); + if (!dst) + return -ENOMEM; + } while (!page_ref_freeze(dst, 1)); + + copy_highpage(dst, folio_page(src, i)); + src->_dst_ul[i] |= (unsigned long)dst; + + cond_resched(); + } + + return 0; +} + +static void free_dst_pages(struct folio *src) +{ + int i; + int nr_pages = folio_nr_pages(src); + + if (folio_can_split(src)) + return; + + for (i = 0; i < nr_pages; i++) { + struct page *dst = folio_dst_page(src, i); + + if (!dst) + continue; + + page_ref_unfreeze(dst, 1); + __free_page(dst); + } + + kfree(src->_dst_pp); + src->_dst_pp = NULL; +} + +static void reset_src_folio(struct folio *src) +{ + if (folio_can_split(src)) + return; + + if (WARN_ON_ONCE(!src->_dst_pp)) + return; + + if (!folio_mapping_flags(src)) + src->mapping = NULL; + + if (folio_test_anon(src) && folio_test_swapcache(src)) { + folio_clear_swapcache(src); + src->swap.val = 0; + } + + kfree(src->_dst_pp); + src->_dst_pp = NULL; +} + +static bool lru_add_dst(struct lruvec *lruvec, struct folio *src, struct folio *dst) +{ + if (folio_can_split(src)) + return false; + + VM_WARN_ON_ONCE_FOLIO(!folio_test_lru(src), src); + VM_WARN_ON_ONCE_FOLIO(folio_test_lru(dst), dst); + VM_WARN_ON_ONCE_FOLIO(folio_lruvec(dst) != folio_lruvec(src), dst); + + if (!lru_gen_add_dst(lruvec, dst)) { + enum lru_list lru = folio_lru_list(dst); + int zone = folio_zonenum(dst); + int delta = folio_nr_pages(dst); + + if (folio_test_unevictable(dst)) + dst->mlock_count = 0; + else + list_add_tail(&dst->lru, &src->lru); + update_lru_size(lruvec, lru, zone, delta); + } + + folio_set_lru(dst); + + return true; +} + static void lru_add_page_tail(struct page *head, struct page *tail, struct lruvec *lruvec, struct list_head *list) { @@ -2793,7 +2947,7 @@ static void lru_add_page_tail(struct page *head, struct page *tail, VM_WARN_ON(PageLRU(head)); get_page(tail); list_add_tail(&tail->lru, list); - } else { + } else if (!lru_add_dst(lruvec, page_folio(head), page_folio(tail))) { /* head is still on lru (and we have it frozen) */ VM_WARN_ON(!PageLRU(head)); if (PageUnevictable(tail)) @@ -2808,7 +2962,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail, struct lruvec *lruvec, struct list_head *list) { struct page *head = &folio->page; - struct page *page_tail = head + tail; + struct page *page_tail = folio_dst_page(folio, tail); /* * Careful: new_folio is not a "real" folio before we cleared PageTail. * Don't pass it around before clear_compound_head(). 
@@ -2849,8 +3003,8 @@ static void __split_huge_page_tail(struct folio *folio, int tail, LRU_GEN_MASK | LRU_REFS_MASK)); /* ->mapping in first and second tail page is replaced by other uses */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, - page_tail); + VM_BUG_ON_PAGE(folio_can_split(folio) && tail > 2 && + page_tail->mapping != TAIL_MAPPING, page_tail); page_tail->mapping = head->mapping; page_tail->index = head->index + tail; @@ -2905,9 +3059,13 @@ static void __split_huge_page(struct page *page, struct list_head *list, unsigned long offset = 0; unsigned int nr = thp_nr_pages(head); int i, nr_dropped = 0; + bool can_split = folio_can_split(folio); /* complete memcg works before add pages to LRU */ - split_page_memcg(head, nr); + if (can_split) + split_page_memcg(head, nr); + else + folio_copy_memcg(folio); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { offset = swp_offset(folio->swap); @@ -2920,46 +3078,51 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageHasHWPoisoned(head); - for (i = nr - 1; i >= 1; i--) { + for (i = nr - 1; i >= can_split; i--) { + struct page *dst = folio_dst_page(folio, i); + + if (!dst) + continue; + __split_huge_page_tail(folio, i, lruvec, list); /* Some pages can be beyond EOF: drop them from page cache */ - if (head[i].index >= end) { - struct folio *tail = page_folio(head + i); + if (dst->index >= end) { + struct folio *tail = page_folio(dst); - if (shmem_mapping(head->mapping)) + if (shmem_mapping(tail->mapping)) nr_dropped++; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, - inode_to_wb(folio->mapping->host)); + inode_to_wb(tail->mapping->host)); __filemap_remove_folio(tail, NULL); folio_put(tail); - } else if (!PageAnon(page)) { - __xa_store(&head->mapping->i_pages, head[i].index, - head + i, 0); + } else if (!PageAnon(dst)) { + __xa_store(&dst->mapping->i_pages, dst->index, dst, 0); } else if (swap_cache) { - __xa_store(&swap_cache->i_pages, offset + i, - head + i, 0); + __xa_store(&swap_cache->i_pages, offset + i, dst, 0); } } - ClearPageCompound(head); + if (can_split) + ClearPageCompound(head); unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ - split_page_owner(head, nr); + if (can_split) + split_page_owner(head, nr); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { /* Additional pin to swap cache */ if (PageSwapCache(head)) { - page_ref_add(head, 2); + page_ref_add(head, 2 - !can_split); xa_unlock(&swap_cache->i_pages); } else { page_ref_inc(head); } } else { /* Additional pin to page cache */ - page_ref_add(head, 2); + page_ref_add(head, 2 - !can_split); xa_unlock(&head->mapping->i_pages); } local_irq_enable(); @@ -2969,8 +3132,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, remap_page(folio, nr); for (i = 0; i < nr; i++) { - struct page *subpage = head + i; - if (subpage == page) + struct page *subpage = folio_dst_page(folio, i); + + if (!subpage || subpage == page) continue; unlock_page(subpage); @@ -2983,6 +3147,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, */ free_page_and_swap_cache(subpage); } + + reset_src_folio(folio); } /* Racy check whether the huge page can be split */ @@ -2990,9 +3156,6 @@ static bool can_split_folio(struct folio *folio, int *pextra_pins) { int extra_pins; - if (!folio_can_split(folio)) - return false; - /* Additional pins from page cache */ if (folio_test_anon(folio)) extra_pins = folio_test_swapcache(folio) ? 
@@ -3112,8 +3275,21 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } + ret = prep_to_unmap(folio); + if (ret) + goto out_unlock; + unmap_folio(folio); + if (!folio_ref_freeze(folio, 1 + extra_pins)) { + ret = -EAGAIN; + goto remap; + } + + ret = prep_dst_pages(folio); + if (ret) + goto unfreeze; + /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { @@ -3123,44 +3299,44 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) */ xas_lock(&xas); xas_reset(&xas); - if (xas_load(&xas) != folio) + if (xas_load(&xas) != folio) { + ret = -EAGAIN; goto fail; + } } /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); - if (folio_ref_freeze(folio, 1 + extra_pins)) { - if (!list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; - list_del(&folio->_deferred_list); - } - spin_unlock(&ds_queue->split_queue_lock); - if (mapping) { - int nr = folio_nr_pages(folio); + if (!list_empty(&folio->_deferred_list)) { + ds_queue->split_queue_len--; + list_del_init(&folio->_deferred_list); + } + spin_unlock(&ds_queue->split_queue_lock); + if (mapping) { + int nr = folio_nr_pages(folio); - xas_split(&xas, folio, folio_order(folio)); - if (folio_test_pmd_mappable(folio)) { - if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, - NR_SHMEM_THPS, -nr); - } else { - __lruvec_stat_mod_folio(folio, - NR_FILE_THPS, -nr); - filemap_nr_thps_dec(mapping); - } + xas_split(&xas, folio, folio_order(folio)); + if (folio_test_pmd_mappable(folio)) { + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); + } else { + __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); } } + } - __split_huge_page(page, list, end); - ret = 0; - } else { - spin_unlock(&ds_queue->split_queue_lock); + __split_huge_page(page, list, end); + if (ret) { fail: if (mapping) xas_unlock(&xas); local_irq_enable(); +unfreeze: + folio_ref_unfreeze(folio, 1 + extra_pins); +remap: + free_dst_pages(folio); remap_page(folio, folio_nr_pages(folio)); - ret = -EAGAIN; } out_unlock: @@ -3172,6 +3348,12 @@ out_unlock: i_mmap_unlock_read(mapping); out: xas_destroy(&xas); + + if (!folio_can_split(folio)) { + count_vm_event(!ret ? THP_SHATTER_PAGE : THP_SHATTER_PAGE_FAILED); + return ret ? : 1; + } + count_vm_event(!ret ? 
THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; } diff --git a/mm/internal.h b/mm/internal.h index 5389197209a3..2cf481d89907 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1416,4 +1416,43 @@ struct vma_prepare { struct vm_area_struct *remove; struct vm_area_struct *remove2; }; + +#define SRC_PAGE_MAPPED BIT(0) +#define SRC_PAGE_MLOCKED BIT(1) +#define SRC_PAGE_CLEAN BIT(2) +#define SRC_PAGE_USAGE_MASK (BIT(3) - 1) + +static inline unsigned long src_page_usage(struct page *page) +{ + struct folio *src = page_folio(page); + int i = folio_page_idx(src, page); + + if (folio_can_split(src) || !src->_dst_ul) + return 0; + + return src->_dst_ul[i] & SRC_PAGE_USAGE_MASK; +} + +static inline bool can_discard_src(struct page *page) +{ + return src_page_usage(page) & SRC_PAGE_CLEAN; +} + +static inline void set_src_usage(struct page *page, unsigned long usage) +{ + struct folio *src = page_folio(page); + int i = folio_page_idx(src, page); + + if (!folio_can_split(src) && src->_dst_ul) + src->_dst_ul[i] |= usage; +} + +static inline struct page *folio_dst_page(struct folio *src, int i) +{ + if (folio_can_split(src) || !src->_dst_ul) + return folio_page(src, i); + + return (void *)(src->_dst_ul[i] & ~SRC_PAGE_USAGE_MASK); +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/madvise.c b/mm/madvise.c index 0102183578ae..d32a3a8f16d7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -418,7 +418,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, err = split_folio(folio); folio_unlock(folio); folio_put(folio); - if (!err) + if (err >= 0) goto regular_folio; return 0; } @@ -516,7 +516,7 @@ regular_folio: if (!start_pte) break; arch_enter_lazy_mmu_mode(); - if (!err) + if (err >= 0) nr = 0; continue; } @@ -749,7 +749,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!start_pte) break; arch_enter_lazy_mmu_mode(); - if (!err) + if (err >= 0) nr = 0; continue; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e9337b1ee3f..5515f4909ad8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3476,6 +3476,53 @@ void split_page_memcg(struct page *head, unsigned int nr) css_get_many(&memcg->css, nr - 1); } +void folio_copy_memcg(struct folio *src) +{ + int i; + unsigned long flags; + int delta = 0; + int nr_pages = folio_nr_pages(src); + struct mem_cgroup *memcg = folio_memcg(src); + + if (folio_can_split(src)) + return; + + if (WARN_ON_ONCE(!src->_dst_pp)) + return; + + if (mem_cgroup_disabled()) + return; + + if (WARN_ON_ONCE(!memcg)) + return; + + VM_WARN_ON_ONCE_FOLIO(!folio_test_large(src), src); + VM_WARN_ON_ONCE_FOLIO(folio_ref_count(src), src); + + for (i = 0; i < nr_pages; i++) { + struct page *dst = folio_dst_page(src, i); + + if (!dst) + continue; + + commit_charge(page_folio(dst), memcg); + delta++; + } + + if (!mem_cgroup_is_root(memcg)) { + page_counter_charge(&memcg->memory, delta); + if (do_memsw_account()) + page_counter_charge(&memcg->memsw, delta); + } + + css_get_many(&memcg->css, delta); + + local_irq_save(flags); + mem_cgroup_charge_statistics(memcg, delta); + memcg_check_events(memcg, folio_nid(src)); + local_irq_restore(flags); +} + #ifdef CONFIG_SWAP /** * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index aed6517b8aa0..c6e2ce7b8237 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2286,7 +2286,7 @@ try_again: * page is a valid handlable page. 
*/ SetPageHasHWPoisoned(hpage); - if (try_to_split_thp_page(p) < 0) { + if (try_to_split_thp_page(p)) { res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); goto unlock_mutex; } diff --git a/mm/migrate.c b/mm/migrate.c index 44516d933e7d..2de7dffe1b39 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -185,36 +185,52 @@ EXPORT_SYMBOL_GPL(putback_movable_pages); /* * Restore a potential migration pte to a working pte entry */ -static bool remove_migration_pte(struct folio *folio, - struct vm_area_struct *vma, unsigned long addr, void *old) +static bool remove_migration_pte(struct folio *dst, + struct vm_area_struct *vma, unsigned long addr, void *arg) { - DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); + struct folio *src = arg; + DEFINE_FOLIO_VMA_WALK(pvmw, src, vma, addr, PVMW_SYNC | PVMW_MIGRATION); while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; pte_t old_pte; pte_t pte; swp_entry_t entry; - struct page *new; + struct page *page; + struct folio *folio; unsigned long idx = 0; /* pgoff is invalid for ksm pages, but they are never large */ - if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + if (folio_test_large(dst) && !folio_test_hugetlb(dst)) idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff; - new = folio_page(folio, idx); + page = folio_page(dst, idx); + + if (src == dst) { + if (can_discard_src(page)) { + VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(src), src); + + pte_clear_not_present_full(pvmw.vma->vm_mm, pvmw.address, + pvmw.pte, false); + dec_mm_counter(pvmw.vma->vm_mm, MM_ANONPAGES); + continue; + } + page = folio_dst_page(src, idx); + } + + folio = page_folio(page); #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); - remove_migration_pmd(&pvmw, new); + remove_migration_pmd(&pvmw, page); continue; } #endif folio_get(folio); - pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); + pte = mk_pte(page, READ_ONCE(vma->vm_page_prot)); old_pte = ptep_get(pvmw.pte); if (pte_swp_soft_dirty(old_pte)) pte = pte_mksoft_dirty(pte); @@ -232,13 +248,13 @@ static bool remove_migration_pte(struct folio *folio, if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE; - if (unlikely(is_device_private_page(new))) { + if (unlikely(is_device_private_page(page))) { if (pte_write(pte)) entry = make_writable_device_private_entry( - page_to_pfn(new)); + page_to_pfn(page)); else entry = make_readable_device_private_entry( - page_to_pfn(new)); + page_to_pfn(page)); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); @@ -264,17 +280,17 @@ static bool remove_migration_pte(struct folio *folio, #endif { if (folio_test_anon(folio)) - folio_add_anon_rmap_pte(folio, new, vma, + folio_add_anon_rmap_pte(folio, page, vma, pvmw.address, rmap_flags); else - folio_add_file_rmap_pte(folio, new, vma); + folio_add_file_rmap_pte(folio, page, vma); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); trace_remove_migration_pte(pvmw.address, pte_val(pte), - compound_order(new)); + compound_order(page)); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); @@ -1482,10 +1498,30 @@ out: return rc; } -static inline int try_split_folio(struct folio *folio, struct list_head *split_folios) +static inline int try_split_folio(struct folio *folio, struct list_head 
*split_folios, + int reason) { int rc; + if (!folio_can_split(folio)) { + LIST_HEAD(head); + + if (reason != MR_CONTIG_RANGE) + return -EBUSY; + + folio_lock(folio); + rc = split_folio_to_list(folio, &head); + folio_unlock(folio); + + if (rc > 0) { + putback_movable_pages(&head); + return 0; + } + + VM_WARN_ON_ONCE_FOLIO(!rc, folio); + return rc; + } + folio_lock(folio); rc = split_folio_to_list(folio, split_folios); folio_unlock(folio); @@ -1665,7 +1701,7 @@ static int migrate_pages_batch(struct list_head *from, if (!thp_migration_supported() && is_thp) { nr_failed++; stats->nr_thp_failed++; - if (!try_split_folio(folio, split_folios)) { + if (!try_split_folio(folio, split_folios, reason)) { stats->nr_thp_split++; continue; } @@ -1696,7 +1732,7 @@ static int migrate_pages_batch(struct list_head *from, stats->nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ if (folio_test_large(folio) && !nosplit) { - int ret = try_split_folio(folio, split_folios); + int ret = try_split_folio(folio, split_folios, reason); if (!ret) { stats->nr_thp_split += is_thp; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eca9cb56df4c..120a317d0938 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1080,6 +1080,10 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) bad_page(page, "nonzero pincount"); goto out; } + if (unlikely(folio->_private_1)) { + bad_page(page, "nonzero _private_1"); + goto out; + } break; case 2: /* diff --git a/mm/rmap.c b/mm/rmap.c index 79a0d1917099..9f76e1a17f26 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2274,6 +2274,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, hsz); else set_pte_at(mm, address, pvmw.pte, swp_pte); + if (vma->vm_flags & VM_LOCKED) + set_src_usage(subpage, SRC_PAGE_MLOCKED); + else + set_src_usage(subpage, SRC_PAGE_MAPPED); trace_set_migration_pte(address, pte_val(swp_pte), compound_order(&folio->page)); /* diff --git a/mm/shmem.c b/mm/shmem.c index fb2e23434509..12d4490a7e76 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -701,7 +701,7 @@ next: folio_put(folio); /* If split failed move the inode on the list back to shrinklist */ - if (ret) + if (ret < 0) goto move_back; split++; @@ -1469,7 +1469,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (folio_test_large(folio)) { /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page(page) < 0) + if (split_huge_page(page)) goto redirty; folio = page_folio(page); folio_clear_dirty(folio); diff --git a/mm/truncate.c b/mm/truncate.c index 21dc202519c2..b7671d151d06 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -210,6 +210,7 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio) */ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) { + int err; loff_t pos = folio_pos(folio); unsigned int offset, length; @@ -241,8 +242,11 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; - if (split_folio(folio) == 0) + err = split_folio(folio); + if (!err) return true; + if (err > 0) + return false; if (folio_test_dirty(folio)) return false; truncate_inode_folio(folio->mapping, folio); diff --git a/mm/vmscan.c b/mm/vmscan.c index e610baa18413..2d8caa7c5c84 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1950,6 +1950,15 @@ retry: goto keep_locked; } + if (folio_ref_count(folio) == 1) { + 
folio_unlock(folio); + if (folio_put_testzero(folio)) + goto free_it; + + nr_reclaimed += nr_pages; + continue; + } + /* * If the folio was split above, the tail pages will make * their own pass through this function and be accounted diff --git a/mm/vmstat.c b/mm/vmstat.c index 4bcb6ec8a5b9..e49e774296b2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1358,6 +1358,9 @@ const char * const vmstat_text[] = { "thp_split_page_failed", "thp_deferred_split_page", "thp_split_pmd", + "thp_shatter_page", + "thp_shatter_page_failed", + "thp_shatter_page_discarded", "thp_scan_exceed_none_pte", "thp_scan_exceed_swap_pte", "thp_scan_exceed_share_pte",
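
For readers new to the series: the commit message above fixes a simple policy — THPs from ZONE_NOMERGE can only be shattered, while THPs from ZONE_NOSPLIT can be shattered, or split down to the zone's minimum order once the prerequisite series [1] is in place. The userspace sketch below only restates that policy as a decision table; the enum and helper names are invented for illustration and are not identifiers from the kernel series.

/*
 * Illustrative, userspace-only sketch of the zone policy stated in the
 * commit message. Compile with any C99 compiler; nothing here is a
 * kernel API.
 */
#include <stdbool.h>
#include <stdio.h>

enum thp_zone { ZONE_MOVABLE_LIKE, ZONE_NOSPLIT_LIKE, ZONE_NOMERGE_LIKE };

struct thp {
	enum thp_zone zone;
	int order;	/* current order of the THP */
	int min_order;	/* minimum order enforced by its zone */
};

static bool can_split(const struct thp *t)
{
	switch (t->zone) {
	case ZONE_NOMERGE_LIKE:
		return false;			/* shattering is the only option */
	case ZONE_NOSPLIT_LIKE:
		/* splitting above the zone's minimum order also requires [1] */
		return t->order > t->min_order;
	default:
		return true;			/* ordinary zones split as before */
	}
}

static bool can_shatter(const struct thp *t)
{
	/* shattering keeps the source THP intact, so both special zones allow it */
	return t->zone == ZONE_NOMERGE_LIKE || t->zone == ZONE_NOSPLIT_LIKE;
}

int main(void)
{
	struct thp t = { .zone = ZONE_NOMERGE_LIKE, .order = 9, .min_order = 9 };

	printf("split: %d, shatter: %d\n", can_split(&t), can_shatter(&t));
	return 0;
}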
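The mm/internal.h helpers in this patch pack two things into each src->_dst_ul[] slot: the usage bits recorded at unmap time (SRC_PAGE_MAPPED/MLOCKED/CLEAN) in the low three bits, and the destination page pointer in the remaining bits, which works because struct page pointers are aligned to well over 8 bytes. Below is a minimal userspace sketch of the same packing scheme under that alignment assumption; the slot_* helpers are invented stand-ins for src_page_usage(), set_src_usage() and folio_dst_page().

/*
 * Userspace sketch of the pointer-plus-flags packing used by the
 * src->_dst_ul[] array: usage bits live in the low three bits of each
 * slot, the destination pointer in the rest. Relies on the pointed-to
 * object being aligned to at least 8 bytes.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SRC_PAGE_MAPPED		(1UL << 0)
#define SRC_PAGE_MLOCKED	(1UL << 1)
#define SRC_PAGE_CLEAN		(1UL << 2)
#define SRC_PAGE_USAGE_MASK	((1UL << 3) - 1)

static void slot_set_usage(unsigned long *slot, unsigned long usage)
{
	*slot |= usage & SRC_PAGE_USAGE_MASK;
}

static void slot_set_dst(unsigned long *slot, void *dst)
{
	/* the low bits must be free for the usage flags */
	assert(((uintptr_t)dst & SRC_PAGE_USAGE_MASK) == 0);
	*slot |= (uintptr_t)dst;
}

static unsigned long slot_usage(unsigned long slot)
{
	return slot & SRC_PAGE_USAGE_MASK;
}

static void *slot_dst(unsigned long slot)
{
	return (void *)(slot & ~SRC_PAGE_USAGE_MASK);
}

int main(void)
{
	unsigned long slot = 0;
	void *dst = aligned_alloc(4096, 4096);	/* stand-in for a base page */

	slot_set_usage(&slot, SRC_PAGE_MAPPED);
	slot_set_dst(&slot, dst);

	printf("usage=%#lx dst=%p\n", slot_usage(slot), slot_dst(slot));
	free(dst);
	return 0;
}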
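The call-site changes in madvise.c, shmem.c and truncate.c all follow from the new return-value convention at the end of split_huge_page_to_list(): 0 still means the folio was split in place, a negative errno still means failure, and the "return ret ? : 1" in the !folio_can_split() branch makes 1 mean the folio was shattered instead. Hence the switches from "!err" to "err >= 0", "ret < 0" and "err > 0". A hedged caller-side sketch of that convention follows; consume_split_result() is invented for the example and is not a kernel function.

/*
 * Sketch of the tri-state return value the updated callers rely on:
 * 0 = split in place, > 0 = shattered (contents migrated to base
 * folios), < 0 = neither happened.
 */
#include <errno.h>
#include <stdio.h>

static const char *consume_split_result(int err)
{
	if (err < 0)
		return "failed: retry or fall back";	/* e.g. -EAGAIN, -EBUSY */
	if (err == 0)
		return "split in place: base pages reuse the old frames";
	return "shattered: contents now live in newly allocated base folios";
}

int main(void)
{
	int samples[] = { 0, 1, -EAGAIN };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%d -> %s\n", samples[i], consume_split_result(samples[i]));
	return 0;
}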