From c1ff6dcf209e4abc23584d2cd117f725421bccac Mon Sep 17 00:00:00 2001
From: Yu Zhao
Date: Wed, 31 Jan 2024 00:12:09 -0700
Subject: [PATCH] FROMLIST: BACKPORT: THP zones: the use cases of policy zones

There are three types of zones:
1. The first four zones partition the physical address space of CPU
   memory.
2. The device zone provides interoperability between CPU and device
   memory.
3. The movable zone commonly represents a memory allocation policy.

Though originally designed for memory hot removal, the movable zone is
instead widely used for other purposes, e.g., CMA and kdump kernel, on
platforms that do not support hot removal, e.g., Android and ChromeOS.
Nowadays, it is legitimately a zone independent of any physical
characteristics. In spite of being somewhat regarded as a hack, largely
due to the lack of a generic design concept for its true major use
cases (on billions of client devices), the movable zone naturally
resembles a policy (virtual) zone overlaid on the first four (physical)
zones.

This proposal formally generalizes this concept as policy zones so that
additional policies can be implemented and enforced by subsequent zones
after the movable zone. An inherited requirement of policy zones (and
the first four zones) is that subsequent zones must be able to fall
back to previous zones and therefore must add new properties to the
previous zones rather than remove existing ones from them. Also, all
properties must be known at allocation time, rather than at runtime;
e.g., memory object size and mobility are valid properties, but hotness
and lifetime are not.

ZONE_MOVABLE becomes the first policy zone, followed by two new policy
zones:
1. ZONE_NOSPLIT, which contains pages that are movable (inherited from
   ZONE_MOVABLE) and restricted to a minimum order, for
   anti-fragmentation purposes. The latter means that they cannot be
   split below that order, whether they are free or in use.
2. ZONE_NOMERGE, which contains pages that are movable and restricted
   to an exact order. The latter means that not only splitting
   (inherited from ZONE_NOSPLIT) but also merging is prohibited (see
   the reason in Chapter Three), whether they are free or in use.

Since these two zones can only serve THP allocations (__GFP_MOVABLE |
__GFP_COMP), they are called THP zones. Reclaim works seamlessly and
compaction is not needed for these two zones.

Compared with the hugeTLB pool approach, THP zones tap into core MM
features including:
1. THP allocations can fall back to the lower zones, which can have
   higher latency but still succeed.
2. THPs can be either shattered (see Chapter Two) if partially unmapped
   or reclaimed if they become cold.
3. THP orders can be much smaller than the PMD/PUD orders, e.g., 64KB
   contiguous PTEs on arm64 [1], which are more suitable for client
   workloads.

Policy zones can be dynamically resized by offlining pages in one of
them and onlining those pages in another. Note that this is only done
among policy zones, not between a policy zone and a physical zone,
since resizing is a (software) policy, not a physical characteristic.

Implementing the same idea at the pageblock granularity has also been
explored but rejected at Google. Pageblocks have a finer granularity
and therefore can be more flexible than zones. The tradeoff is that
this alternative implementation was more complex and failed to bring a
better ROI. However, the rejection was mainly due to its inability to
be smoothly extended to 1GB THPs [2], which is a planned use case of
TAO.
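As an illustration only (the sizes and orders below are hypothetical
and merely show the syntax of the boot parameters this patch adds to
kernel-parameters.txt), a 4KB-page arm64 system whose pageblock order
is the PMD order could carve out a 64KB-order nosplit zone and a
PMD-order nomerge zone with:

    nosplit=8G,4 nomerge=4G,9

i.e., 8GB of memory whose pages cannot be split below order 4 (64KB)
and 4GB of memory whose pages stay at exactly order 9 (2MB). Like
movablecore, the size can also be given as a percentage, e.g.,
nosplit=5%,4. The requested orders are ignored, with a warning, if the
nomerge order is not higher than the nosplit order or if either exceeds
the pageblock order.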
[1] https://lore.kernel.org/20240215103205.2607016-1-ryan.roberts@arm.com/ [2] https://lore.kernel.org/20200928175428.4110504-1-zi.yan@sent.com/ Change-Id: I7eb555541d04b16b93dea5aa0e2b329c49694a10 Signed-off-by: Yu Zhao Link: https://lore.kernel.org/r/20240229183436.4110845-2-yuzhao@google.com/ Bug: 313807618 [ Don't allocate order 0 from nomerge/nosplit zone - causes increase in reclaim activity ] Signed-off-by: Kalesh Singh --- .../admin-guide/kernel-parameters.txt | 10 + drivers/virtio/virtio_mem.c | 2 +- include/linux/gfp.h | 27 +- include/linux/huge_mm.h | 6 - include/linux/mempolicy.h | 2 +- include/linux/mmzone.h | 52 +- include/linux/nodemask.h | 2 +- include/linux/vm_event_item.h | 2 +- include/trace/events/mmflags.h | 4 +- mm/compaction.c | 12 + mm/huge_memory.c | 5 +- mm/mempolicy.c | 14 +- mm/migrate.c | 4 +- mm/mm_init.c | 482 +++++++++++------- mm/page_alloc.c | 47 +- mm/page_isolation.c | 2 +- mm/vmscan.c | 29 +- mm/vmstat.c | 7 +- 18 files changed, 456 insertions(+), 253 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0e7ac433fcf3..979d128836b2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3500,6 +3500,16 @@ allocations which rules out almost all kernel allocations. Use with caution! + nosplit=X,Y [MM] Set the minimum order of the nosplit zone. Pages in + this zone can't be split down below order Y, while free + or in use. + Like movablecore, X should be either nn[KMGTPE] or n%. + + nomerge=X,Y [MM] Set the exact orders of the nomerge zone. Pages in + this zone are always order Y, meaning they can't be + split or merged while free or in use. + Like movablecore, X should be either nn[KMGTPE] or n%. + MTD_Partition= [MTD] Format: ,,, diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index fa5226c198cc..414a08c577ee 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2228,7 +2228,7 @@ static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, page = pfn_to_online_page(pfn); if (!page) continue; - if (page_zonenum(page) != ZONE_MOVABLE) + if (!is_zone_movable_page(page)) return false; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index dd8d20d1b8ac..54b655dfd859 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -85,8 +85,8 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms. 
*/ -#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4 -/* ZONE_DEVICE is not a valid GFP zone specifier */ +#if MAX_NR_ZONES - 2 - IS_ENABLED(CONFIG_ZONE_DEVICE) <= 4 +/* zones beyond ZONE_MOVABLE are not valid GFP zone specifiers */ #define GFP_ZONES_SHIFT 2 #else #define GFP_ZONES_SHIFT ZONES_SHIFT @@ -124,6 +124,8 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) +DECLARE_STATIC_KEY_FALSE(movablecore_enabled); + static inline enum zone_type __gfp_zone(gfp_t flags) { enum zone_type z; @@ -132,9 +134,30 @@ static inline enum zone_type __gfp_zone(gfp_t flags) z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + + if ((flags & __GFP_COMP) && + (!static_branch_unlikely(&movablecore_enabled) || (flags & __GFP_MOVABLE))) + return LAST_VIRT_ZONE; + return z; } +extern int zone_nomerge_order __read_mostly; +extern int zone_nosplit_order __read_mostly; + +static inline enum zone_type gfp_order_zone(gfp_t flags, int order) +{ + enum zone_type zid = __gfp_zone(flags); + + if (zid >= ZONE_NOMERGE && (!zone_nomerge_order || order != zone_nomerge_order)) + zid = ZONE_NOMERGE - 1; + + if (zid == ZONE_NOSPLIT && (!zone_nosplit_order || order < zone_nosplit_order)) + zid = ZONE_NOSPLIT - 1; + + return zid; +} + enum zone_type gfp_zone(gfp_t flags); /* diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2d3619049e03..c4de5fb1683b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -294,7 +294,6 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void folio_prep_large_rmappable(struct folio *folio); -bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { @@ -446,11 +445,6 @@ static inline void folio_prep_large_rmappable(struct folio *folio) {} #define thp_get_unmapped_area NULL -static inline bool -can_split_folio(struct folio *folio, int *pextra_pins) -{ - return false; -} static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 6c2754d7bfed..f30b03045f72 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -157,7 +157,7 @@ extern enum zone_type policy_zone; static inline void check_highest_zone(enum zone_type k) { - if (k > policy_zone && k != ZONE_MOVABLE) + if (k > policy_zone && !zid_is_virt(k)) policy_zone = k; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fa88b7a81b2f..a5077cf27cd8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -815,11 +815,15 @@ enum zone_type { * there can be false negatives). 
*/ ZONE_MOVABLE, + ZONE_NOSPLIT, + ZONE_NOMERGE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif - __MAX_NR_ZONES + __MAX_NR_ZONES, + LAST_PHYS_ZONE = ZONE_MOVABLE - 1, + LAST_VIRT_ZONE = ZONE_NOMERGE, }; #ifndef __GENERATING_BOUNDS_H @@ -938,6 +942,8 @@ struct zone { seqlock_t span_seqlock; #endif + int order; + int initialized; /* Write-intensive fields used from the page allocator */ @@ -1155,12 +1161,22 @@ static inline bool folio_is_zone_device(const struct folio *folio) static inline bool is_zone_movable_page(const struct page *page) { - return page_zonenum(page) == ZONE_MOVABLE; + return page_zonenum(page) >= ZONE_MOVABLE; } static inline bool folio_is_zone_movable(const struct folio *folio) { - return folio_zonenum(folio) == ZONE_MOVABLE; + return folio_zonenum(folio) >= ZONE_MOVABLE; +} + +static inline bool page_can_split(struct page *page) +{ + return page_zonenum(page) < ZONE_NOSPLIT; +} + +static inline bool folio_can_split(struct folio *folio) +{ + return folio_zonenum(folio) < ZONE_NOSPLIT; } #endif @@ -1480,6 +1496,32 @@ static inline int local_memory_node(int node_id) { return node_id; }; */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +static inline bool zid_is_virt(enum zone_type zid) +{ + return zid > LAST_PHYS_ZONE && zid <= LAST_VIRT_ZONE; +} + +static inline bool zone_can_frag(struct zone *zone) +{ + VM_WARN_ON_ONCE(zone->order && zone_idx(zone) < ZONE_NOSPLIT); + + return zone_idx(zone) < ZONE_NOSPLIT; +} + +static inline bool zone_is_suitable(struct zone *zone, int order) +{ + int zid = zone_idx(zone); + + if (zid < ZONE_NOSPLIT) + return true; + + if (!zone->order) + return false; + + return (zid == ZONE_NOSPLIT && order >= zone->order) || + (zid == ZONE_NOMERGE && order == zone->order); +} + #ifdef CONFIG_ZONE_DEVICE static inline bool zone_is_zone_device(struct zone *zone) { @@ -1528,13 +1570,13 @@ static inline int zone_to_nid(struct zone *zone) static inline void zone_set_nid(struct zone *zone, int nid) {} #endif -extern int movable_zone; +extern int virt_zone; static inline int is_highmem_idx(enum zone_type idx) { #ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM || - (idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM)); + (zid_is_virt(idx) && virt_zone == ZONE_HIGHMEM)); #else return 0; #endif diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 8d07116caaf1..e16c879ca63d 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -404,7 +404,7 @@ enum node_states { #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif - N_MEMORY, /* The node has memory(regular, high, movable) */ + N_MEMORY, /* The node has memory in any of the zones */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ NR_NODE_STATES diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 8abfa1240040..86a33075dfd5 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -27,7 +27,7 @@ #endif #define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, \ - HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx) + HIGHMEM_ZONE(xx) xx##_MOVABLE, xx##_NOSPLIT, xx##_NOMERGE, DEVICE_ZONE(xx) enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 923254273b10..36fdb175e0e8 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -276,7 +276,9 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ IFDEF_ZONE_DMA32( EM 
(ZONE_DMA32, "DMA32")) \ EM (ZONE_NORMAL, "Normal") \ IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ - EMe(ZONE_MOVABLE,"Movable") + EM (ZONE_MOVABLE,"Movable") \ + EM (ZONE_NOSPLIT,"NoSplit") \ + EMe(ZONE_NOMERGE,"NoMerge") #define LRU_NAMES \ EM (LRU_INACTIVE_ANON, "inactive_anon") \ diff --git a/mm/compaction.c b/mm/compaction.c index a8d7e51a6c15..7c3837779532 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2725,6 +2725,9 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, ac->highest_zoneidx, ac->nodemask) { enum compact_result status; + if (!zone_can_frag(zone)) + continue; + if (prio > MIN_COMPACT_PRIORITY && compaction_deferred(zone, order)) { rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); @@ -2797,6 +2800,9 @@ static void proactive_compact_node(pg_data_t *pgdat) if (!populated_zone(zone)) continue; + if (!zone_can_frag(zone)) + continue; + cc.zone = zone; compact_zone(&cc, NULL); @@ -2829,6 +2835,9 @@ static void compact_node(int nid) if (!populated_zone(zone)) continue; + if (!zone_can_frag(zone)) + continue; + cc.zone = zone; compact_zone(&cc, NULL); @@ -2942,6 +2951,9 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) if (!populated_zone(zone)) continue; + if (!zone_can_frag(zone)) + continue; + /* Allocation can already succeed, check other zones */ if (zone_watermark_ok(zone, pgdat->kcompactd_max_order, min_wmark_pages(zone), diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e2207be91313..dffa327edbcd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2986,10 +2986,13 @@ static void __split_huge_page(struct page *page, struct list_head *list, } /* Racy check whether the huge page can be split */ -bool can_split_folio(struct folio *folio, int *pextra_pins) +static bool can_split_folio(struct folio *folio, int *pextra_pins) { int extra_pins; + if (!folio_can_split(folio)) + return false; + /* Additional pins from page cache */ if (folio_test_anon(folio)) extra_pins = folio_test_swapcache(folio) ? diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 85825ff9b94c..8db5ac8d856e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1836,22 +1836,20 @@ bool vma_policy_mof(struct vm_area_struct *vma) bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { - enum zone_type dynamic_policy_zone = policy_zone; - - BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); + WARN_ON_ONCE(zid_is_virt(policy_zone)); /* - * if policy->nodes has movable memory only, - * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. + * If policy->nodes has memory in virtual zones only, we apply policy + * only if gfp_zone(gfp) can allocate from those zones. * * policy->nodes is intersect with node_states[N_MEMORY]. * so if the following test fails, it implies - * policy->nodes has movable memory only. + * policy->nodes has memory in virtual zones only. 
*/ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) - dynamic_policy_zone = ZONE_MOVABLE; + return zone > LAST_PHYS_ZONE; - return zone >= dynamic_policy_zone; + return zone >= policy_zone; } /* diff --git a/mm/migrate.c b/mm/migrate.c index 3dd2776750c0..44516d933e7d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2028,7 +2028,7 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private) order = folio_order(src); } zidx = zone_idx(folio_zone(src)); - if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) + if (zidx > ZONE_NORMAL) gfp_mask |= __GFP_HIGHMEM; return __folio_alloc(gfp_mask, order, nid, mtc->nmask); @@ -2525,7 +2525,7 @@ static int numamigrate_isolate_folio(pg_data_t *pgdat, struct folio *folio) return 0; wakeup_kswapd(pgdat->node_zones + z, 0, - folio_order(folio), ZONE_MOVABLE); + folio_order(folio), z); return 0; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 77fd04c83d04..a0b3b08f97b6 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -217,12 +217,18 @@ postcore_initcall(mm_sysfs_init); static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; -static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; -static unsigned long required_kernelcore __initdata; -static unsigned long required_kernelcore_percent __initdata; -static unsigned long required_movablecore __initdata; -static unsigned long required_movablecore_percent __initdata; +static unsigned long virt_zones[LAST_VIRT_ZONE - LAST_PHYS_ZONE][MAX_NUMNODES] __initdata; +#define pfn_of(zid, nid) (virt_zones[(zid) - LAST_PHYS_ZONE - 1][nid]) + +static unsigned long zone_nr_pages[LAST_VIRT_ZONE - LAST_PHYS_ZONE + 1] __initdata; +#define nr_pages_of(zid) (zone_nr_pages[(zid) - LAST_PHYS_ZONE]) + +static unsigned long zone_percentage[LAST_VIRT_ZONE - LAST_PHYS_ZONE + 1] __initdata; +#define percentage_of(zid) (zone_percentage[(zid) - LAST_PHYS_ZONE]) + +int zone_nosplit_order __read_mostly; +int zone_nomerge_order __read_mostly; static unsigned long nr_kernel_pages __initdata; static unsigned long nr_all_pages __initdata; @@ -273,25 +279,71 @@ static int __init cmdline_parse_kernelcore(char *p) return 0; } - return cmdline_parse_core(p, &required_kernelcore, - &required_kernelcore_percent); + return cmdline_parse_core(p, &nr_pages_of(LAST_PHYS_ZONE), + &percentage_of(LAST_PHYS_ZONE)); } early_param("kernelcore", cmdline_parse_kernelcore); +DEFINE_STATIC_KEY_FALSE(movablecore_enabled); + /* * movablecore=size sets the amount of memory for use for allocations that * can be reclaimed or migrated. 
*/ static int __init cmdline_parse_movablecore(char *p) { - return cmdline_parse_core(p, &required_movablecore, - &required_movablecore_percent); + static_branch_enable(&movablecore_enabled); + + return cmdline_parse_core(p, &nr_pages_of(ZONE_MOVABLE), + &percentage_of(ZONE_MOVABLE)); } early_param("movablecore", cmdline_parse_movablecore); +static int __init parse_zone_order(char *p, unsigned long *nr_pages, + unsigned long *percent, int *order) +{ + int err; + unsigned long n; + char *s = strchr(p, ','); + + if (!s) + return -EINVAL; + + *s++ = '\0'; + + err = kstrtoul(s, 0, &n); + if (err) + return err; + + if (n < 2 || n > MAX_ORDER) + return -EINVAL; + + err = cmdline_parse_core(p, nr_pages, percent); + if (err) + return err; + + *order = n; + + return 0; +} + +static int __init parse_zone_nosplit(char *p) +{ + return parse_zone_order(p, &nr_pages_of(ZONE_NOSPLIT), + &percentage_of(ZONE_NOSPLIT), &zone_nosplit_order); +} +early_param("nosplit", parse_zone_nosplit); + +static int __init parse_zone_nomerge(char *p) +{ + return parse_zone_order(p, &nr_pages_of(ZONE_NOMERGE), + &percentage_of(ZONE_NOMERGE), &zone_nomerge_order); +} +early_param("nomerge", parse_zone_nomerge); + /* * early_calculate_totalpages() - * Sum pages in active regions for movable zone. + * Sum pages in active regions for virtual zones. * Populate N_MEMORY for calculating usable_nodes. */ static unsigned long __init early_calculate_totalpages(void) @@ -311,24 +363,110 @@ static unsigned long __init early_calculate_totalpages(void) } /* - * This finds a zone that can be used for ZONE_MOVABLE pages. The + * This finds a physical zone that can be used for virtual zones. The * assumption is made that zones within a node are ordered in monotonic * increasing memory addresses so that the "highest" populated zone is used */ -static void __init find_usable_zone_for_movable(void) +static void __init find_usable_zone(void) { int zone_index; - for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { - if (zone_index == ZONE_MOVABLE) - continue; - + for (zone_index = LAST_PHYS_ZONE; zone_index >= 0; zone_index--) { if (arch_zone_highest_possible_pfn[zone_index] > arch_zone_lowest_possible_pfn[zone_index]) break; } VM_BUG_ON(zone_index == -1); - movable_zone = zone_index; + virt_zone = zone_index; +} + +static void __init find_virt_zone(unsigned long occupied, unsigned long *zone_pfn) +{ + int i, nid; + unsigned long node_avg, remaining; + int usable_nodes = nodes_weight(node_states[N_MEMORY]); + /* usable_startpfn is the lowest possible pfn virtual zones can be at */ + unsigned long usable_startpfn = arch_zone_lowest_possible_pfn[virt_zone]; + +restart: + /* Carve out memory as evenly as possible throughout nodes */ + node_avg = occupied / usable_nodes; + for_each_node_state(nid, N_MEMORY) { + unsigned long start_pfn, end_pfn; + + /* + * Recalculate node_avg if the division per node now exceeds + * what is necessary to satisfy the amount of memory to carve + * out. + */ + if (occupied < node_avg) + node_avg = occupied / usable_nodes; + + /* + * As the map is walked, we track how much memory is usable + * using remaining. When it is 0, the rest of the node is + * usable. 
+ */ + remaining = node_avg; + + /* Go through each range of PFNs within this node */ + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + unsigned long size_pages; + + start_pfn = max(start_pfn, zone_pfn[nid]); + if (start_pfn >= end_pfn) + continue; + + /* Account for what is only usable when carving out */ + if (start_pfn < usable_startpfn) { + unsigned long nr_pages = min(end_pfn, usable_startpfn) - start_pfn; + + remaining -= min(nr_pages, remaining); + occupied -= min(nr_pages, occupied); + + /* Continue if range is now fully accounted */ + if (end_pfn <= usable_startpfn) { + + /* + * Push zone_pfn to the end so that if + * we have to carve out more across + * nodes, we will not double account + * here. + */ + zone_pfn[nid] = end_pfn; + continue; + } + start_pfn = usable_startpfn; + } + + /* + * The usable PFN range is from start_pfn->end_pfn. + * Calculate size_pages as the number of pages used. + */ + size_pages = end_pfn - start_pfn; + if (size_pages > remaining) + size_pages = remaining; + zone_pfn[nid] = start_pfn + size_pages; + + /* + * Some memory was carved out, update counts and break + * if the request for this node has been satisfied. + */ + occupied -= min(occupied, size_pages); + remaining -= size_pages; + if (!remaining) + break; + } + } + + /* + * If there is still more to carve out, we do another pass with one less + * node in the count. This will push zone_pfn[nid] further along on the + * nodes that still have memory until the request is fully satisfied. + */ + usable_nodes--; + if (usable_nodes && occupied > usable_nodes) + goto restart; } /* @@ -337,19 +475,19 @@ static void __init find_usable_zone_for_movable(void) * memory. When they don't, some nodes will have more kernelcore than * others */ -static void __init find_zone_movable_pfns_for_nodes(void) +static void __init find_virt_zones(void) { - int i, nid; + int i; + int nid; unsigned long usable_startpfn; - unsigned long kernelcore_node, kernelcore_remaining; /* save the state before borrow the nodemask */ nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); - int usable_nodes = nodes_weight(node_states[N_MEMORY]); struct memblock_region *r; + unsigned long occupied = 0; - /* Need to find movable_zone earlier when movable_node is specified. */ - find_usable_zone_for_movable(); + /* Need to find virt_zone earlier when movable_node is specified. */ + find_usable_zone(); /* * If movable_node is specified, ignore kernelcore and movablecore @@ -363,8 +501,8 @@ static void __init find_zone_movable_pfns_for_nodes(void) nid = memblock_get_region_node(r); usable_startpfn = PFN_DOWN(r->base); - zone_movable_pfn[nid] = zone_movable_pfn[nid] ? - min(usable_startpfn, zone_movable_pfn[nid]) : + pfn_of(ZONE_MOVABLE, nid) = pfn_of(ZONE_MOVABLE, nid) ? + min(usable_startpfn, pfn_of(ZONE_MOVABLE, nid)) : usable_startpfn; } @@ -400,8 +538,8 @@ static void __init find_zone_movable_pfns_for_nodes(void) continue; } - zone_movable_pfn[nid] = zone_movable_pfn[nid] ? - min(usable_startpfn, zone_movable_pfn[nid]) : + pfn_of(ZONE_MOVABLE, nid) = pfn_of(ZONE_MOVABLE, nid) ? 
+ min(usable_startpfn, pfn_of(ZONE_MOVABLE, nid)) : usable_startpfn; } @@ -411,151 +549,92 @@ static void __init find_zone_movable_pfns_for_nodes(void) goto out2; } + if (zone_nomerge_order > pageblock_order) { + nr_pages_of(ZONE_NOMERGE) = 0; + percentage_of(ZONE_NOMERGE) = 0; + zone_nomerge_order = 0; + pr_warn("zone %s order %d cannot be higher than pageblock order %d\n", + zone_names[ZONE_NOMERGE], zone_nomerge_order, pageblock_order); + } + + if (zone_nosplit_order > pageblock_order) { + nr_pages_of(ZONE_NOSPLIT) = 0; + percentage_of(ZONE_NOSPLIT) = 0; + zone_nosplit_order = 0; + pr_warn("zone %s order %d cannot be higher than pageblock order %d\n", + zone_names[ZONE_NOSPLIT], zone_nosplit_order, pageblock_order); + } + + if (zone_nomerge_order && zone_nomerge_order <= zone_nosplit_order) { + nr_pages_of(ZONE_NOSPLIT) = nr_pages_of(ZONE_NOMERGE) = 0; + percentage_of(ZONE_NOSPLIT) = percentage_of(ZONE_NOMERGE) = 0; + zone_nosplit_order = zone_nomerge_order = 0; + pr_warn("zone %s order %d cannot be higher than zone %s order %d\n", + zone_names[ZONE_NOSPLIT], zone_nosplit_order, + zone_names[ZONE_NOMERGE], zone_nomerge_order); + } + /* * If kernelcore=nn% or movablecore=nn% was specified, calculate the * amount of necessary memory. */ - if (required_kernelcore_percent) - required_kernelcore = (totalpages * 100 * required_kernelcore_percent) / - 10000UL; - if (required_movablecore_percent) - required_movablecore = (totalpages * 100 * required_movablecore_percent) / - 10000UL; + for (i = LAST_PHYS_ZONE; i <= LAST_VIRT_ZONE; i++) { + if (percentage_of(i)) + nr_pages_of(i) = totalpages * percentage_of(i) / 100; + + nr_pages_of(i) = roundup(nr_pages_of(i), MAX_ORDER_NR_PAGES); + occupied += nr_pages_of(i); + } /* * If movablecore= was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore * and movablecore are specified, then the value of kernelcore - * will be used for required_kernelcore if it's greater than - * what movablecore would have allowed. + * will be used if it's greater than what movablecore would have + * allowed. */ - if (required_movablecore) { - unsigned long corepages; + if (occupied < totalpages) { + enum zone_type zid; - /* - * Round-up so that ZONE_MOVABLE is at least as large as what - * was requested by the user - */ - required_movablecore = - roundup(required_movablecore, MAX_ORDER_NR_PAGES); - required_movablecore = min(totalpages, required_movablecore); - corepages = totalpages - required_movablecore; - - required_kernelcore = max(required_kernelcore, corepages); + zid = !nr_pages_of(LAST_PHYS_ZONE) || nr_pages_of(ZONE_MOVABLE) ? + LAST_PHYS_ZONE : ZONE_MOVABLE; + nr_pages_of(zid) += totalpages - occupied; } /* * If kernelcore was not specified or kernelcore size is larger - * than totalpages, there is no ZONE_MOVABLE. + * than totalpages, there are not virtual zones. 
*/ - if (!required_kernelcore || required_kernelcore >= totalpages) + occupied = nr_pages_of(LAST_PHYS_ZONE); + if (!occupied || occupied >= totalpages) goto out; - /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ - usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; + for (i = LAST_PHYS_ZONE + 1; i <= LAST_VIRT_ZONE; i++) { + if (!nr_pages_of(i)) + continue; -restart: - /* Spread kernelcore memory as evenly as possible throughout nodes */ - kernelcore_node = required_kernelcore / usable_nodes; - for_each_node_state(nid, N_MEMORY) { - unsigned long start_pfn, end_pfn; - - /* - * Recalculate kernelcore_node if the division per node - * now exceeds what is necessary to satisfy the requested - * amount of memory for the kernel - */ - if (required_kernelcore < kernelcore_node) - kernelcore_node = required_kernelcore / usable_nodes; - - /* - * As the map is walked, we track how much memory is usable - * by the kernel using kernelcore_remaining. When it is - * 0, the rest of the node is usable by ZONE_MOVABLE - */ - kernelcore_remaining = kernelcore_node; - - /* Go through each range of PFNs within this node */ - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - unsigned long size_pages; - - start_pfn = max(start_pfn, zone_movable_pfn[nid]); - if (start_pfn >= end_pfn) - continue; - - /* Account for what is only usable for kernelcore */ - if (start_pfn < usable_startpfn) { - unsigned long kernel_pages; - kernel_pages = min(end_pfn, usable_startpfn) - - start_pfn; - - kernelcore_remaining -= min(kernel_pages, - kernelcore_remaining); - required_kernelcore -= min(kernel_pages, - required_kernelcore); - - /* Continue if range is now fully accounted */ - if (end_pfn <= usable_startpfn) { - - /* - * Push zone_movable_pfn to the end so - * that if we have to rebalance - * kernelcore across nodes, we will - * not double account here - */ - zone_movable_pfn[nid] = end_pfn; - continue; - } - start_pfn = usable_startpfn; - } - - /* - * The usable PFN range for ZONE_MOVABLE is from - * start_pfn->end_pfn. Calculate size_pages as the - * number of pages used as kernelcore - */ - size_pages = end_pfn - start_pfn; - if (size_pages > kernelcore_remaining) - size_pages = kernelcore_remaining; - zone_movable_pfn[nid] = start_pfn + size_pages; - - /* - * Some kernelcore has been met, update counts and - * break if the kernelcore for this node has been - * satisfied - */ - required_kernelcore -= min(required_kernelcore, - size_pages); - kernelcore_remaining -= size_pages; - if (!kernelcore_remaining) - break; - } + find_virt_zone(occupied, &pfn_of(i, 0)); + occupied += nr_pages_of(i); } - - /* - * If there is still required_kernelcore, we do another pass with one - * less node in the count. 
This will push zone_movable_pfn[nid] further - * along on the nodes that still have memory until kernelcore is - * satisfied - */ - usable_nodes--; - if (usable_nodes && required_kernelcore > usable_nodes) - goto restart; - out2: - /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ + /* Align starts of virtual zones on all nids to MAX_ORDER_NR_PAGES */ for (nid = 0; nid < MAX_NUMNODES; nid++) { unsigned long start_pfn, end_pfn; - - zone_movable_pfn[nid] = - roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + unsigned long prev_virt_zone_pfn = 0; get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); - if (zone_movable_pfn[nid] >= end_pfn) - zone_movable_pfn[nid] = 0; - } + for (i = LAST_PHYS_ZONE + 1; i <= LAST_VIRT_ZONE; i++) { + pfn_of(i, nid) = roundup(pfn_of(i, nid), MAX_ORDER_NR_PAGES); + + if (pfn_of(i, nid) <= prev_virt_zone_pfn || pfn_of(i, nid) >= end_pfn) + pfn_of(i, nid) = 0; + + if (pfn_of(i, nid)) + prev_virt_zone_pfn = pfn_of(i, nid); + } + } out: /* restore the node_state */ node_states[N_MEMORY] = saved_node_state; @@ -1104,38 +1183,54 @@ void __ref memmap_init_zone_device(struct zone *zone, #endif /* - * The zone ranges provided by the architecture do not include ZONE_MOVABLE - * because it is sized independent of architecture. Unlike the other zones, - * the starting point for ZONE_MOVABLE is not fixed. It may be different - * in each node depending on the size of each node and how evenly kernelcore - * is distributed. This helper function adjusts the zone ranges + * The zone ranges provided by the architecture do not include virtual zones + * because they are sized independent of architecture. Unlike physical zones, + * the starting point for the first populated virtual zone is not fixed. It may + * be different in each node depending on the size of each node and how evenly + * kernelcore is distributed. This helper function adjusts the zone ranges * provided by the architecture for a given node by using the end of the - * highest usable zone for ZONE_MOVABLE. This preserves the assumption that - * zones within a node are in order of monotonic increases memory addresses + * highest usable zone for the first populated virtual zone. This preserves the + * assumption that zones within a node are in order of monotonic increases + * memory addresses. 
*/ -static void __init adjust_zone_range_for_zone_movable(int nid, +static void __init adjust_zone_range(int nid, unsigned long zone_type, unsigned long node_end_pfn, unsigned long *zone_start_pfn, unsigned long *zone_end_pfn) { - /* Only adjust if ZONE_MOVABLE is on this node */ - if (zone_movable_pfn[nid]) { - /* Size ZONE_MOVABLE */ - if (zone_type == ZONE_MOVABLE) { - *zone_start_pfn = zone_movable_pfn[nid]; - *zone_end_pfn = min(node_end_pfn, - arch_zone_highest_possible_pfn[movable_zone]); + int i = max_t(int, zone_type, LAST_PHYS_ZONE); + unsigned long next_virt_zone_pfn = 0; - /* Adjust for ZONE_MOVABLE starting within this range */ - } else if (!mirrored_kernelcore && - *zone_start_pfn < zone_movable_pfn[nid] && - *zone_end_pfn > zone_movable_pfn[nid]) { - *zone_end_pfn = zone_movable_pfn[nid]; + while (i++ < LAST_VIRT_ZONE) { + if (pfn_of(i, nid)) { + next_virt_zone_pfn = pfn_of(i, nid); + break; + } + } - /* Check if this whole range is within ZONE_MOVABLE */ - } else if (*zone_start_pfn >= zone_movable_pfn[nid]) + if (zone_type <= LAST_PHYS_ZONE) { + if (!next_virt_zone_pfn) + return; + + if (!mirrored_kernelcore && + *zone_start_pfn < next_virt_zone_pfn && + *zone_end_pfn > next_virt_zone_pfn) + *zone_end_pfn = next_virt_zone_pfn; + else if (*zone_start_pfn >= next_virt_zone_pfn) *zone_start_pfn = *zone_end_pfn; + } else if (zone_type <= LAST_VIRT_ZONE) { + if (!pfn_of(zone_type, nid)) + return; + + if (next_virt_zone_pfn) + *zone_end_pfn = min3(next_virt_zone_pfn, + node_end_pfn, + arch_zone_highest_possible_pfn[virt_zone]); + else + *zone_end_pfn = min(node_end_pfn, + arch_zone_highest_possible_pfn[virt_zone]); + *zone_start_pfn = min(*zone_end_pfn, pfn_of(zone_type, nid)); } } @@ -1191,7 +1286,7 @@ static unsigned long __init zone_absent_pages_in_node(int nid, * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages * and vice versa. 
*/ - if (mirrored_kernelcore && zone_movable_pfn[nid]) { + if (mirrored_kernelcore && pfn_of(ZONE_MOVABLE, nid)) { unsigned long start_pfn, end_pfn; struct memblock_region *r; @@ -1231,8 +1326,7 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, /* Get the start and end of the zone */ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); - adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn, - zone_start_pfn, zone_end_pfn); + adjust_zone_range(nid, zone_type, node_end_pfn, zone_start_pfn, zone_end_pfn); /* Check that this node has pages within the zone's required range */ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) @@ -1297,6 +1391,10 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, #if defined(CONFIG_MEMORY_HOTPLUG) zone->present_early_pages = real_size; #endif + if (i == ZONE_NOSPLIT) + zone->order = zone_nosplit_order; + if (i == ZONE_NOMERGE) + zone->order = zone_nomerge_order; totalpages += spanned; realtotalpages += real_size; @@ -1748,7 +1846,7 @@ static void __init check_for_memory(pg_data_t *pgdat) { enum zone_type zone_type; - for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { + for (zone_type = 0; zone_type <= LAST_PHYS_ZONE; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; if (populated_zone(zone)) { if (IS_ENABLED(CONFIG_HIGHMEM)) @@ -1798,7 +1896,7 @@ static bool arch_has_descending_max_zone_pfns(void) void __init free_area_init(unsigned long *max_zone_pfn) { unsigned long start_pfn, end_pfn; - int i, nid, zone; + int i, j, nid, zone; bool descending; /* Record where the zone boundaries are */ @@ -1810,15 +1908,12 @@ void __init free_area_init(unsigned long *max_zone_pfn) start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); - for (i = 0; i < MAX_NR_ZONES; i++) { + for (i = 0; i <= LAST_PHYS_ZONE; i++) { if (descending) - zone = MAX_NR_ZONES - i - 1; + zone = LAST_PHYS_ZONE - i; else zone = i; - if (zone == ZONE_MOVABLE) - continue; - end_pfn = max(max_zone_pfn[zone], start_pfn); arch_zone_lowest_possible_pfn[zone] = start_pfn; arch_zone_highest_possible_pfn[zone] = end_pfn; @@ -1826,15 +1921,17 @@ void __init free_area_init(unsigned long *max_zone_pfn) start_pfn = end_pfn; } - /* Find the PFNs that ZONE_MOVABLE begins at in each node */ - memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); - find_zone_movable_pfns_for_nodes(); + /* Initialise every node */ + mminit_verify_pageflags_layout(); + setup_nr_node_ids(); + set_pageblock_order(); + + /* Find the PFNs that virtual zones begin at in each node */ + find_virt_zones(); /* Print out the zone ranges */ pr_info("Zone ranges:\n"); - for (i = 0; i < MAX_NR_ZONES; i++) { - if (i == ZONE_MOVABLE) - continue; + for (i = 0; i <= LAST_PHYS_ZONE; i++) { pr_info(" %-8s ", zone_names[i]); if (arch_zone_lowest_possible_pfn[i] == arch_zone_highest_possible_pfn[i]) @@ -1847,12 +1944,14 @@ void __init free_area_init(unsigned long *max_zone_pfn) << PAGE_SHIFT) - 1); } - /* Print out the PFNs ZONE_MOVABLE begins at in each node */ - pr_info("Movable zone start for each node\n"); - for (i = 0; i < MAX_NUMNODES; i++) { - if (zone_movable_pfn[i]) - pr_info(" Node %d: %#018Lx\n", i, - (u64)zone_movable_pfn[i] << PAGE_SHIFT); + /* Print out the PFNs virtual zones begin at in each node */ + for (; i <= LAST_VIRT_ZONE; i++) { + pr_info("%s zone start for each node\n", zone_names[i]); + for (j = 0; j < MAX_NUMNODES; j++) { + 
if (pfn_of(i, j)) + pr_info(" Node %d: %#018Lx\n", + j, (u64)pfn_of(i, j) << PAGE_SHIFT); + } } /* @@ -1868,11 +1967,6 @@ void __init free_area_init(unsigned long *max_zone_pfn) subsection_map_init(start_pfn, end_pfn - start_pfn); } - /* Initialise every node */ - mminit_verify_pageflags_layout(); - setup_nr_node_ids(); - set_pageblock_order(); - for_each_node(nid) { pg_data_t *pgdat; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1fc99c1ad27e..eca9cb56df4c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -315,6 +315,8 @@ char * const zone_names[MAX_NR_ZONES] = { "HighMem", #endif "Movable", + "NoSplit", + "NoMerge", #ifdef CONFIG_ZONE_DEVICE "Device", #endif @@ -338,9 +340,9 @@ int user_min_free_kbytes = -1; static int watermark_boost_factor __read_mostly = 15000; static int watermark_scale_factor = 10; -/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ -int movable_zone; -EXPORT_SYMBOL(movable_zone); +/* virt_zone is the "real" zone pages in virtual zones are taken from */ +int virt_zone; +EXPORT_SYMBOL(virt_zone); #if MAX_NUMNODES > 1 unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -802,9 +804,6 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, unsigned long higher_page_pfn; struct page *higher_page; - if (order >= MAX_ORDER - 1) - return false; - higher_page_pfn = buddy_pfn & pfn; higher_page = page + (higher_page_pfn - pfn); @@ -812,6 +811,11 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, NULL) != NULL; } +static int zone_max_order(struct zone *zone) +{ + return zone->order && zone_idx(zone) == ZONE_NOMERGE ? zone->order : MAX_ORDER; +} + /* * Freeing function for a buddy system allocator. * @@ -846,6 +850,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_pfn; struct page *buddy; bool to_tail; + int max_order = zone_max_order(zone); VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -857,7 +862,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); - while (order < MAX_ORDER) { + while (order < max_order) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -904,6 +909,8 @@ done_merging: to_tail = true; else if (is_shuffle_order(order)) to_tail = shuffle_pick_tail(); + else if (order + 1 >= max_order) + to_tail = false; else to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); @@ -941,6 +948,8 @@ int split_free_page(struct page *free_page, int mt; int ret = 0; + VM_WARN_ON_ONCE_PAGE(!page_can_split(free_page), free_page); + if (split_pfn_offset == 0) return ret; @@ -1652,6 +1661,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, struct free_area *area; struct page *page; + VM_WARN_ON_ONCE(!zone_is_suitable(zone, order)); + /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) { area = &(zone->free_area[current_order]); @@ -2987,6 +2998,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, long min = mark; int o; + if (!zone_is_suitable(z, order)) + return false; + /* free_pages may go negative - that's OK */ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); @@ -3079,6 +3093,9 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, { long free_pages; + if 
(!zone_is_suitable(z, order)) + return false; + free_pages = zone_page_state(z, NR_FREE_PAGES); /* @@ -3227,6 +3244,9 @@ retry: struct page *page; unsigned long mark; + if (!zone_is_suitable(zone, order)) + continue; + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && !__cpuset_zone_allowed(zone, gfp_mask)) @@ -3906,6 +3926,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); + if (!(gfp_mask & __GFP_DIRECT_RECLAIM) && gfp_order_zone(gfp_mask, order) > ZONE_MOVABLE) + alloc_flags |= ALLOC_KSWAPD; + return alloc_flags; } @@ -5844,9 +5867,9 @@ static void __setup_per_zone_wmarks(void) struct zone *zone; unsigned long flags; - /* Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages */ + /* Calculate total number of pages below ZONE_HIGHMEM */ for_each_zone(zone) { - if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE) + if (zone_idx(zone) <= ZONE_NORMAL) lowmem_pages += zone_managed_pages(zone); } @@ -5856,11 +5879,11 @@ static void __setup_per_zone_wmarks(void) spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone_managed_pages(zone); do_div(tmp, lowmem_pages); - if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) { + if (zone_idx(zone) > ZONE_NORMAL) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't - * need highmem and movable zones pages, so cap pages_min - * to a small value here. + * need pages from zones above ZONE_NORMAL, so cap + * pages_min to a small value here. * * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas control async page reclaim, and so should diff --git a/mm/page_isolation.c b/mm/page_isolation.c index ef441fa45563..142b9ac6c97a 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -70,7 +70,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e * pages then it should be reasonably safe to assume the rest * is movable. */ - if (zone_idx(zone) == ZONE_MOVABLE) + if (zid_is_virt(zone_idx(zone))) continue; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index aacbd03773a4..e610baa18413 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1918,18 +1918,13 @@ retry: goto keep_locked; if (folio_maybe_dma_pinned(folio)) goto keep_locked; - if (folio_test_large(folio)) { - /* cannot split folio, skip it */ - if (!can_split_folio(folio, NULL)) - goto activate_locked; - /* - * Split partially mapped folios right away. - * We can free the unmapped pages without IO. - */ - if (data_race(!list_empty(&folio->_deferred_list)) && - split_folio_to_list(folio, folio_list)) - goto activate_locked; - } + /* + * Split partially mapped folios right away. + * We can free the unmapped pages without IO. 
+ */ + if (folio_test_large(folio) && + data_race(!list_empty(&folio->_deferred_list))) + split_folio_to_list(folio, folio_list); if (!add_to_swap(folio)) { int __maybe_unused order = folio_order(folio); @@ -6824,7 +6819,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) orig_mask = sc->gfp_mask; if (buffer_heads_over_limit) { sc->gfp_mask |= __GFP_HIGHMEM; - sc->reclaim_idx = gfp_zone(sc->gfp_mask); + sc->reclaim_idx = gfp_order_zone(sc->gfp_mask, sc->order); } for_each_zone_zonelist_nodemask(zone, z, zonelist, @@ -7154,7 +7149,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = current_gfp_context(gfp_mask), - .reclaim_idx = gfp_zone(gfp_mask), + .reclaim_idx = gfp_order_zone(gfp_mask, order), .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, @@ -7920,6 +7915,10 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, if (!cpuset_zone_allowed(zone, gfp_flags)) return; + curr_idx = gfp_order_zone(gfp_flags, order); + if (highest_zoneidx > curr_idx) + highest_zoneidx = curr_idx; + pgdat = zone->zone_pgdat; curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); @@ -8129,7 +8128,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, - .reclaim_idx = gfp_zone(gfp_mask), + .reclaim_idx = gfp_order_zone(gfp_mask, order), }; unsigned long pflags; diff --git a/mm/vmstat.c b/mm/vmstat.c index 6fac2f128802..4bcb6ec8a5b9 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1163,6 +1163,7 @@ int fragmentation_index(struct zone *zone, unsigned int order) #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ TEXT_FOR_HIGHMEM(xx) xx "_movable", \ + xx "_nosplit", xx "_nomerge", \ TEXT_FOR_DEVICE(xx) const char * const vmstat_text[] = { @@ -1692,7 +1693,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n spanned %lu" "\n present %lu" "\n managed %lu" - "\n cma %lu", + "\n cma %lu" + "\n order %u", zone_page_state(zone, NR_FREE_PAGES), zone->watermark_boost, min_wmark_pages(zone), @@ -1701,7 +1703,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone->spanned_pages, zone->present_pages, zone_managed_pages(zone), - zone_cma_pages(zone)); + zone_cma_pages(zone), + zone->order); seq_printf(m, "\n protection: (%ld",