Merge android15-6.6 into android15-6.6-lts
This merges the android15-6.6 branch into the -lts branch, catching it up with the latest changes in there. It contains the following commits: *3a0107a38e
ANDROID: KVM: arm64: Ensure SVE initialization precedes PSCI for protected VCPUs *3b75103301
ANDROID: 16K: Use vma_area slab cache for pad VMA *a213abada8
UPSTREAM: af_unix: Fix uninit-value in __unix_walk_scc() *5156d49ed9
UPSTREAM: af_unix: Fix garbage collection of embryos carrying OOB with SCM_RIGHTS *fbd783363d
ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'dead' *ddd6979a15
UPSTREAM: af_unix: Add dead flag to struct scm_fp_list. *95a397ac6b
UPSTREAM: af_unix: Don't access successor in unix_del_edges() during GC. *a130d07d24
UPSTREAM: af_unix: Try not to hold unix_gc_lock during accept(). *5ada288086
UPSTREAM: af_unix: Remove lock dance in unix_peek_fds(). *11d208f893
UPSTREAM: af_unix: Replace garbage collection algorithm. *67a3a58da1
UPSTREAM: af_unix: Detect dead SCC. *b9f8dfdb54
UPSTREAM: af_unix: Assign a unique index to SCC. *b22b0a7597
UPSTREAM: af_unix: Avoid Tarjan's algorithm if unnecessary. *1e4d62adeb
UPSTREAM: af_unix: Skip GC if no cycle exists. *250c362acd
UPSTREAM: af_unix: Save O(n) setup of Tarjan's algo. *0c40a05117
UPSTREAM: af_unix: Fix up unix_edge.successor for embryo socket. *f5ea8b439d
UPSTREAM: af_unix: Save listener for embryo socket. *279ed20d5f
UPSTREAM: af_unix: Detect Strongly Connected Components. *16dca90335
UPSTREAM: af_unix: Iterate all vertices by DFS. *80df4d17af
UPSTREAM: af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb. *40549e6976
ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'inflight' *769fc01f23
UPSTREAM: af_unix: Link struct unix_edge when queuing skb. *de6b1e85b9
ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'edges' *844c9666eb
UPSTREAM: af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd. *c93b3ba51e
ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'vertices' *ffef32ddaf
UPSTREAM: af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd. *f972f2d7b1
ANDROID: af_unix: Allocate memory for the largest possible size of 'struct scm_fp_list' *b077571da9
UPSTREAM: af_unix: Remove CONFIG_UNIX_SCM. *a390e62751
ANDROID: Align x86-64 microdroid cgroup support with aarch64 microdroid *6dbb3c2e90
BACKPORT: mm: remove folio from deferred split list before uncharging it *a8553b4e2a
BACKPORT: mm: use __page_cache_release() in folios_put() *4d61851d14
UPSTREAM: mm: fix list corruption in put_pages_list *f61f355bdc
UPSTREAM: mm: use free_unref_folios() in put_pages_list() *316b2e6e4b
BACKPORT: mm: remove use of folio list from folios_put() *f9c6fb1b82
BACKPORT: memcg: add mem_cgroup_uncharge_folios() *3bc695b2be
Merge tag 'android15-6.6.92_r00' into android15-6.6 *0813441033
FROMGIT: scsi: core: ufs: Fix a hang in the error handler *a74f052176
FROMGIT: genirq/cpuhotplug: Restore affinity even for suspended IRQ *fc6844d9d2
FROMGIT: genirq/cpuhotplug: Rebalance managed interrupts across multi-CPU hotplug *0bc63a98d9
ANDROID: abi_gki_aarch64_vivo: Update symbol list *8fb77f6f9d
ANDROID: mm: Reset unused page flag bits on free *f0bd864fe0
Revert "ANDROID: mm: Set PAGE_BLOCK_ORDER to 7 when ARM64_16K_PAGES" *97f5b70ad3
ANDROID: GKI: Update symbol list for xiaomi *2bc7bc937c
BACKPORT: erofs: lazily initialize per-CPU workers and CPU hotplug hooks *434940a426
FROMGIT: scsi: ufs: mcq: Delete ufshcd_release_scsi_cmd() in ufshcd_mcq_abort() *0ac9aa9b62
ANDROID: GKI: Rename xring's symbol list. *f56b0532df
BACKPORT: mm: set pageblock_order to HPAGE_PMD_ORDER in case with !CONFIG_HUGETLB_PAGE but THP enabled *f19494634f
ANDROID: GKI: Update symbol list for vivo *68191d9c7a
ANDROID: vendor_hooks: add hook to retry mempool allocation without delay *45afa56280
ANDROID: mm: Set PAGE_BLOCK_ORDER to 7 when ARM64_16K_PAGES *3148030c78
ANDROID: KVM: arm64: Fix hyp_alloc(0) *4ec55296c6
ANDROID: fix out-of-bounds error when trace_create_new_event *d9ec0e18f4
ANDROID: CONFIG_CRYPTO_SHA1_ARM64_CE=y to GKI and Microdroid kernel *0272a2ffdc
BACKPORT: FROMGIT: mm: add CONFIG_PAGE_BLOCK_ORDER to select page block order *86ba3f3eb2
BACKPORT: binder: Create safe versions of binder log files *8a55e7a02a
UPSTREAM: binder: Refactor binder_node print synchronization *fe02cfa135
ANDROID: iommu/arm-smmu-v3-kvm: Fix accidental domain ID freeing in free() *9733cd1fa2
ANDROID: GKI: Update xiaomi symbol list. *125f87a148
UPSTREAM: mm/memcg: use kmem_cache when alloc memcg pernode info *78e6a3d422
UPSTREAM: mm/memcg: use kmem_cache when alloc memcg *b6bde4b648
UPSTREAM: mm/memcg: move mem_cgroup_init() ahead of cgroup_init() *476cb9bc9b
UPSTREAM: af_unix: Remove io_uring code for GC. *fb219cbb0b
UPSTREAM: af_unix: Replace BUG_ON() with WARN_ON_ONCE(). *3c39219343
ANDROID: Enable memory controller for microdroid *c6325b075d
ANDROID: cgroup: Fix cgroup_root backport padding calculation *452d899d2f
ANDROID: GKI: Fix up abi issue in struct scm_fp_list *cec9cb02ce
UPSTREAM: af_unix: Try to run GC async. *93c2d24134
BACKPORT: FROMGIT: usb: typec: tcpm: move tcpm_queue_vdm_unlocked to asynchronous work *ee016b98b7
BACKPORT: usb: typec: tcpm: enforce ready state when queueing alt mode vdm *4be94a6b03
ANDROID: ABI: Update pixel symbol list *6af2e78f07
ANDROID: fix ABI breakage for trace_array extensions *6f62c0d0fb
UPSTREAM: tracing: Allow creating instances with specified system events *f8d73c6178
UPSTREAM: af_unix: Run GC on only one CPU. *a70bd568b1
UPSTREAM: af_unix: Return struct unix_sock from unix_get_socket(). *c1b974e51d
UPSTREAM: iommu: Handle race with default domain setup *315fdde476
ANDROID: ABI: Update pixel symbol list *32288ce2f2
ANDROID: vendor_hooks: Add hooks for xhci reset *dd8fcb5398
ANDROID: GKI: deferred split queue corruption - ABI fixup *374babecde
UPSTREAM: mm/thp: fix deferred split queue not partially_mapped: fix *3a8faa5b25
BACKPORT: mm/thp: fix deferred split unqueue naming and locking *84cc354617
UPSTREAM: mm/thp: fix deferred split queue not partially_mapped *dd46964f3e
BACKPORT: mm: add sysfs entry to disable splitting underused THPs *40ffd525e5
UPSTREAM: mm: split underused THPs *a63eadb11d
BACKPORT: mm: introduce a pageflag for partially mapped folios *f1b73b0513
UPSTREAM: mm/migrate: fix kernel BUG at mm/compaction.c:2761! *cbbd153073
BACKPORT: mm/migrate: split source folio if it is on deferred split list *c6f085c328
BACKPORT: mm: count the number of partially mapped anonymous THPs per size *545db6094c
BACKPORT: mm: count the number of anonymous THPs per size *6ee860d0d4
UPSTREAM: mm: separate out FOLIO_FLAGS from PAGEFLAGS *f052bbc24d
UPSTREAM: mm: selftest to verify zero-filled pages are mapped to zeropage *d826c84482
BACKPORT: mm: remap unused subpages to shared zeropage when splitting isolated thp *bc9f1a0f43
Revert "BACKPORT: mm/thp: fix deferred split unqueue naming and locking" *c06fa3b5cd
ANDROID: GKI: page_alloc ABI fixup *819bdc71dc
BACKPORT: mm: page_alloc: batch vmstat updates in expand() *c97dfdfac0
UPSTREAM: mm/page_alloc: keep track of free highatomic *cdff4faf2b
UPSTREAM: mm: remove unused has_isolate_pageblock *5b5902fcf6
UPSTREAM: mm: page_alloc: fix highatomic typing in multi-block buddies *48e8763a95
BACKPORT: mm: page_alloc: consolidate free page accounting *a4f7bd4b3d
BACKPORT: mm: page_isolation: prepare for hygienic freelists *a8dcfbc68b
UPSTREAM: mm: page_alloc: set migratetype inside move_freepages() *209c219a0f
BACKPORT: mm: page_alloc: close migratetype race between freeing and stealing *1a3654f59a
BACKPORT: mm: page_alloc: fix freelist movement during block conversion *861e9d3c44
UPSTREAM: mm: page_alloc: fix move_freepages_block() range error *350c3b1d61
UPSTREAM: mm: page_alloc: move free pages when converting block during isolation *f76299151c
UPSTREAM: mm: page_alloc: fix up block types when merging compatible blocks *cb610236ed
UPSTREAM: mm: page_alloc: optimize free_unref_folios() *606130dacb
BACKPORT: mm: page_alloc: remove pcppage migratetype caching *a7a880e6de
UPSTREAM: mm: allow non-hugetlb large folios to be batch processed *f17c4db9cf
BACKPORT: mm: handle large folios in free_unref_folios() *c7f67cfb85
UPSTREAM: mm: use folios_put() in __folio_batch_release() *445fa9a71a
BACKPORT: mm: add free_unref_folios() *cc058410b3
BACKPORT: mm: convert free_unref_page_list() to use folios *980cb4e2ba
BACKPORT: mm: make folios_put() the basis of release_pages() *5f4ed005d7
Revert "BACKPORT: mm: page_alloc: remove pcppage migratetype caching" *bab99c1b7e
Revert "UPSTREAM: mm: page_alloc: fix up block types when merging compatible blocks" *94e3afbb3d
Revert "UPSTREAM: mm: page_alloc: move free pages when converting block during isolation" *13aa15180a
Revert "UPSTREAM: mm: page_alloc: fix move_freepages_block() range error" *d47518de38
Revert "UPSTREAM: mm: page_alloc: fix freelist movement during block conversion" *135ab7374e
Revert "BACKPORT: mm: page_alloc: close migratetype race between freeing and stealing" *9ed2d2fba2
Revert "UPSTREAM: mm: page_alloc: set migratetype inside move_freepages()" *efbdb11ac1
Revert "BACKPORT: mm: page_isolation: prepare for hygienic freelists" *7d424e0f80
Revert "BACKPORT: mm: page_alloc: consolidate free page accounting" *8a91cd1d26
Revert "BACKPORT: mm: page_alloc: batch vmstat updates in expand()" *be6d3cc085
Revert "UPSTREAM: mm: page_alloc: fix highatomic typing in multi-block buddies" *bbc65a78d2
Revert "BACKPORT: mm/page_alloc: keep track of free highatomic" *a7a0d95bca
Revert "BACKPORT: mm: page_alloc: optimize free_unref_folios()" *8b5d78fb5c
Revert "ANDROID: fuse-bpf: fix wrong logic in read backing" *c1488e58c3
ANDROID: GKI: Update symbol list for Nvidia *1e3d640b05
ANDROID: GKI: Add initial Nvidia symbol list *5fa476bd0b
ANDROID: Add ufs headers to aarch64 allowlist *17daf81bcc
ANDROID: KVM: arm64: Allow relinqush for p-guest with huge-mappings *297e1ff805
ANDROID: KVM: arm64: Use unmap for pKVM guests memory relinquish *7c95a219c0
ANDROID: KVM: arm64: Add hyp request SPLIT *e56d181356
ANDROID: KVM: arm64: Convert kvm_pinned_pages to an interval-tree *390699f93d
ANDROID: KVM: arm64: Add host_split_guest for pKVM *16df80ab9c
ANDROID: KVM: arm64: Disable relinquish for p-guest huge-mappings *549ac47ca0
FROMGIT: PM: runtime: fix denying of auto suspend in pm_suspend_timer_fn() *4cdfd02ff2
ANDROID: Enable SHA1 for microdroid *ab0ad8d198
BACKPORT: mm: page_alloc: optimize free_unref_folios() Change-Id: Ic5571553dd22417e2ff66c8e99c114b8d79476f2 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
This commit is contained in:
@@ -126,6 +126,7 @@ filegroup(
|
||||
"android/abi_gki_aarch64_mtk",
|
||||
"android/abi_gki_aarch64_mtktv",
|
||||
"android/abi_gki_aarch64_nothing",
|
||||
"android/abi_gki_aarch64_nvidia",
|
||||
"android/abi_gki_aarch64_oplus",
|
||||
"android/abi_gki_aarch64_paragon",
|
||||
"android/abi_gki_aarch64_pixel",
|
||||
@@ -140,7 +141,7 @@ filegroup(
|
||||
"android/abi_gki_aarch64_virtual_device",
|
||||
"android/abi_gki_aarch64_vivo",
|
||||
"android/abi_gki_aarch64_xiaomi",
|
||||
"android/abi_gki_aarch64_xiaomi2",
|
||||
"android/abi_gki_aarch64_xiaomi_xring",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
@@ -1028,6 +1029,9 @@ ddk_headers(
|
||||
"drivers/pci/controller/dwc/pcie-designware.h",
|
||||
"drivers/thermal/thermal_core.h",
|
||||
"drivers/thermal/thermal_netlink.h",
|
||||
"drivers/ufs/core/ufshcd-crypto.h",
|
||||
"drivers/ufs/core/ufshcd-priv.h",
|
||||
"drivers/ufs/host/ufshcd-pltfrm.h",
|
||||
"drivers/usb/dwc3/core.h",
|
||||
"sound/usb/card.h",
|
||||
"sound/usb/usbaudio.h",
|
||||
@@ -1045,6 +1049,7 @@ ddk_headers(
|
||||
"drivers/extcon",
|
||||
"drivers/pci/controller/dwc",
|
||||
"drivers/thermal",
|
||||
"drivers/ufs",
|
||||
"drivers/usb",
|
||||
"sound/usb",
|
||||
"include",
|
||||
|
@@ -202,6 +202,16 @@ PMD-mappable transparent hugepage::
|
||||
|
||||
cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
|
||||
|
||||
All THPs at fault and collapse time will be added to _deferred_list,
|
||||
and will therefore be split under memory presure if they are considered
|
||||
"underused". A THP is underused if the number of zero-filled pages in
|
||||
the THP is above max_ptes_none (see below). It is possible to disable
|
||||
this behaviour by writing 0 to shrink_underused, and enable it by writing
|
||||
1 to it::
|
||||
|
||||
echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused
|
||||
echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused
|
||||
|
||||
khugepaged will be automatically started when one or more hugepage
|
||||
sizes are enabled (either by directly setting "always" or "madvise",
|
||||
or by setting "inherit" while the top-level enabled is set to "always"
|
||||
@@ -443,6 +453,12 @@ thp_deferred_split_page
|
||||
splitting it would free up some memory. Pages on split queue are
|
||||
going to be split under memory pressure.
|
||||
|
||||
thp_underused_split_page
|
||||
is incremented when a huge page on the split queue was split
|
||||
because it was underused. A THP is underused if the number of
|
||||
zero pages in the THP is above a certain threshold
|
||||
(/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none).
|
||||
|
||||
thp_split_pmd
|
||||
is incremented every time a PMD split into table of PTEs.
|
||||
This can happen, for instance, when application calls mprotect() or
|
||||
@@ -510,6 +526,18 @@ split_deferred
|
||||
it would free up some memory. Pages on split queue are going to
|
||||
be split under memory pressure, if splitting is possible.
|
||||
|
||||
nr_anon
|
||||
the number of anonymous THP we have in the whole system. These THPs
|
||||
might be currently entirely mapped or have partially unmapped/unused
|
||||
subpages.
|
||||
|
||||
nr_anon_partially_mapped
|
||||
the number of anonymous THP which are likely partially mapped, possibly
|
||||
wasting memory, and have been queued for deferred memory reclamation.
|
||||
Note that in corner some cases (e.g., failed migration), we might detect
|
||||
an anonymous THP as "partially mapped" and count it here, even though it
|
||||
is not actually partially mapped anymore.
|
||||
|
||||
As the system ages, allocating huge pages may be expensive as the
|
||||
system uses memory compaction to copy data around memory to free a
|
||||
huge page for use. There are some counters in ``/proc/vmstat`` to help
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -132,3 +132,84 @@ type 'struct io_ring_ctx' changed
|
||||
1 variable symbol(s) removed
|
||||
'struct tracepoint __tracepoint_android_vh_filemap_fault_before_folio_locked'
|
||||
|
||||
type 'struct kvm_protected_vm' changed
|
||||
member 'struct maple_tree pinned_pages' was removed
|
||||
member 'union { struct rb_root_cached pinned_pages; struct { struct maple_tree __unused; }; union { }; }' was added
|
||||
|
||||
type 'struct kvm_hyp_req' changed
|
||||
member changed from 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; }' to 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; struct { unsigned long guest_ipa; size_t size; } split; }'
|
||||
type changed from 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; }' to 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; struct { unsigned long guest_ipa; size_t size; } split; }'
|
||||
member 'struct { unsigned long guest_ipa; size_t size; } split' was added
|
||||
|
||||
type 'struct scm_fp_list' changed
|
||||
byte size changed from 2040 to 2048
|
||||
member 'short count_unix' was added
|
||||
|
||||
type 'struct scm_fp_list' changed
|
||||
byte size changed from 2048 to 2064
|
||||
member 'struct list_head vertices' was added
|
||||
member 'short count_unix' changed
|
||||
offset changed by 128
|
||||
|
||||
type 'struct scm_fp_list' changed
|
||||
byte size changed from 2064 to 2072
|
||||
member 'struct unix_edge* edges' was added
|
||||
member 'short count_unix' changed
|
||||
offset changed by 64
|
||||
|
||||
type 'struct scm_fp_list' changed
|
||||
byte size changed from 2072 to 2080
|
||||
member 'bool inflight' was added
|
||||
3 members ('struct list_head vertices' .. 'short count_unix') changed
|
||||
offset changed by 64
|
||||
|
||||
type 'struct unix_edge' changed
|
||||
byte size changed from 32 to 48
|
||||
member 'struct list_head stack_entry' was added
|
||||
|
||||
type 'struct unix_vertex' changed
|
||||
byte size changed from 40 to 48
|
||||
member 'unsigned long index' was added
|
||||
|
||||
type 'struct unix_vertex' changed
|
||||
byte size changed from 48 to 80
|
||||
member 'struct list_head scc_entry' was added
|
||||
2 members ('unsigned long out_degree' .. 'unsigned long index') changed
|
||||
offset changed by 128
|
||||
member 'unsigned long lowlink' was added
|
||||
member 'bool on_stack' was added
|
||||
|
||||
type 'struct unix_sock' changed
|
||||
member 'struct sock* listener' was added
|
||||
4 members ('struct list_head link' .. 'unsigned long gc_flags') changed
|
||||
offset changed by 64
|
||||
|
||||
type 'struct unix_vertex' changed
|
||||
byte size changed from 80 to 72
|
||||
member 'bool on_stack' was removed
|
||||
|
||||
type 'struct unix_vertex' changed
|
||||
member 'unsigned long lowlink' was removed
|
||||
member 'unsigned long scc_index' was added
|
||||
|
||||
type 'struct unix_sock' changed
|
||||
byte size changed from 1216 to 1152
|
||||
member 'struct list_head link' was removed
|
||||
member 'unsigned long inflight' was removed
|
||||
member 'spinlock_t lock' changed
|
||||
offset changed by -192
|
||||
member 'unsigned long gc_flags' was removed
|
||||
4 members ('struct socket_wq peer_wq' .. 'struct sk_buff* oob_skb') changed
|
||||
offset changed by -512
|
||||
|
||||
type 'struct unix_sock' changed
|
||||
member 'struct sk_buff* oob_skb' changed
|
||||
offset changed by 64
|
||||
|
||||
type 'struct scm_stat' changed
|
||||
byte size changed from 4 to 16
|
||||
member 'unsigned long nr_unix_fds' was added
|
||||
|
||||
type 'struct scm_fp_list' changed
|
||||
member 'bool dead' was added
|
||||
|
||||
|
232
android/abi_gki_aarch64_nvidia
Normal file
232
android/abi_gki_aarch64_nvidia
Normal file
@@ -0,0 +1,232 @@
|
||||
[abi_symbol_list]
|
||||
# commonly used symbols
|
||||
alloc_chrdev_region
|
||||
alt_cb_patch_nops
|
||||
__arch_copy_from_user
|
||||
__arch_copy_to_user
|
||||
cdev_add
|
||||
cdev_del
|
||||
cdev_init
|
||||
__check_object_size
|
||||
class_create
|
||||
class_destroy
|
||||
complete
|
||||
dev_driver_string
|
||||
_dev_err
|
||||
device_create
|
||||
device_destroy
|
||||
_dev_info
|
||||
devm_kfree
|
||||
devm_kmalloc
|
||||
devm_memremap
|
||||
devm_request_threaded_irq
|
||||
_dev_warn
|
||||
fortify_panic
|
||||
free_irq
|
||||
__init_swait_queue_head
|
||||
init_timer_key
|
||||
__init_waitqueue_head
|
||||
jiffies_to_usecs
|
||||
kfree
|
||||
__kmalloc
|
||||
kmalloc_caches
|
||||
kmalloc_trace
|
||||
kstrtouint
|
||||
log_post_read_mmio
|
||||
log_read_mmio
|
||||
memcpy
|
||||
__memcpy_fromio
|
||||
memset
|
||||
module_layout
|
||||
__mutex_init
|
||||
mutex_lock
|
||||
mutex_unlock
|
||||
of_find_property
|
||||
of_property_read_u32_index
|
||||
of_property_read_variable_u32_array
|
||||
panic
|
||||
pid_task
|
||||
__platform_driver_register
|
||||
platform_driver_unregister
|
||||
_printk
|
||||
__put_task_struct
|
||||
_raw_spin_lock
|
||||
_raw_spin_unlock
|
||||
request_threaded_irq
|
||||
schedule_timeout
|
||||
snprintf
|
||||
__stack_chk_fail
|
||||
strlen
|
||||
strncmp
|
||||
strnlen
|
||||
strscpy
|
||||
sysfs_create_group
|
||||
sysfs_remove_group
|
||||
system_cpucaps
|
||||
system_wq
|
||||
tegra_ivc_notified
|
||||
tegra_ivc_read_advance
|
||||
tegra_ivc_read_get_next_frame
|
||||
tegra_ivc_reset
|
||||
tegra_ivc_write_advance
|
||||
tegra_ivc_write_get_next_frame
|
||||
__traceiter_rwmmio_post_read
|
||||
__traceiter_rwmmio_read
|
||||
__tracepoint_rwmmio_post_read
|
||||
__tracepoint_rwmmio_read
|
||||
unregister_chrdev_region
|
||||
__wake_up
|
||||
__warn_printk
|
||||
|
||||
# required by ivc-cdev.ko
|
||||
device_del
|
||||
devm_free_irq
|
||||
noop_llseek
|
||||
remap_pfn_range
|
||||
|
||||
# required by ivc_ext.ko
|
||||
dma_sync_single_for_cpu
|
||||
__memcpy_toio
|
||||
|
||||
# required by nvsciipc.ko
|
||||
_dev_notice
|
||||
__fdget
|
||||
find_get_pid
|
||||
fput
|
||||
platform_device_register_full
|
||||
platform_device_unregister
|
||||
sprintf
|
||||
|
||||
# required by tegra_bpmp.ko
|
||||
clk_hw_determine_rate_no_reparent
|
||||
clk_hw_get_name
|
||||
clk_hw_unregister
|
||||
debugfs_create_dir
|
||||
debugfs_create_file
|
||||
debugfs_remove
|
||||
dentry_path_raw
|
||||
devm_clk_hw_register
|
||||
devm_reset_controller_register
|
||||
dma_alloc_attrs
|
||||
dma_free_attrs
|
||||
_find_next_bit
|
||||
kmalloc_large
|
||||
kstrdup
|
||||
ktime_get
|
||||
of_clk_add_hw_provider
|
||||
of_device_get_match_data
|
||||
of_genpd_add_provider_onecell
|
||||
__of_parse_phandle_with_args
|
||||
of_platform_default_populate
|
||||
pm_genpd_init
|
||||
pm_genpd_remove
|
||||
seq_lseek
|
||||
seq_read
|
||||
seq_write
|
||||
single_open_size
|
||||
single_release
|
||||
strncpy
|
||||
tegra_bpmp_free_mrq
|
||||
tegra_bpmp_mrq_is_supported
|
||||
tegra_bpmp_mrq_return
|
||||
tegra_bpmp_request_mrq
|
||||
tegra_bpmp_transfer
|
||||
tegra_bpmp_transfer_atomic
|
||||
tegra_sku_info
|
||||
|
||||
# required by tegra_hv.ko
|
||||
arm64_use_ng_mappings
|
||||
class_create_file_ns
|
||||
ioremap_prot
|
||||
iounmap
|
||||
irq_get_irq_data
|
||||
memstart_addr
|
||||
of_add_property
|
||||
of_chosen
|
||||
of_find_compatible_node
|
||||
of_irq_get
|
||||
pfn_is_map_memory
|
||||
tegra_ivc_init
|
||||
|
||||
# required by tegra_hv_pm_ctl.ko
|
||||
__alloc_skb
|
||||
find_vpid
|
||||
finish_wait
|
||||
init_net
|
||||
init_wait_entry
|
||||
msleep
|
||||
__netlink_kernel_create
|
||||
netlink_unicast
|
||||
__nlmsg_put
|
||||
prepare_to_wait_event
|
||||
register_pm_notifier
|
||||
schedule
|
||||
strcmp
|
||||
wait_for_completion_timeout
|
||||
|
||||
# required by tegra_hv_vblk_oops.ko
|
||||
delayed_work_timer_fn
|
||||
dma_map_page_attrs
|
||||
__get_free_pages
|
||||
is_vmalloc_addr
|
||||
queue_delayed_work_on
|
||||
|
||||
# required by tegra_vblk.ko
|
||||
blk_execute_rq
|
||||
blk_mq_alloc_disk_for_queue
|
||||
blk_mq_alloc_request
|
||||
blk_mq_alloc_tag_set
|
||||
blk_mq_destroy_queue
|
||||
blk_mq_end_request
|
||||
blk_mq_free_request
|
||||
blk_mq_free_tag_set
|
||||
blk_mq_init_queue
|
||||
blk_mq_start_hw_queues
|
||||
blk_mq_start_request
|
||||
blk_mq_stop_hw_queues
|
||||
blk_queue_flag_set
|
||||
blk_queue_logical_block_size
|
||||
blk_queue_max_discard_sectors
|
||||
blk_queue_max_hw_sectors
|
||||
blk_queue_max_secure_erase_sectors
|
||||
blk_queue_physical_block_size
|
||||
blk_queue_write_cache
|
||||
__blk_rq_map_sg
|
||||
capable
|
||||
__cpu_possible_mask
|
||||
del_gendisk
|
||||
device_add_disk
|
||||
device_create_file
|
||||
disable_irq
|
||||
disk_check_media_change
|
||||
dma_map_sg_attrs
|
||||
dma_unmap_sg_attrs
|
||||
enable_irq
|
||||
_find_first_zero_bit
|
||||
jiffies
|
||||
kasan_flag_enabled
|
||||
kthread_create_on_cpu
|
||||
kthread_create_on_node
|
||||
__list_add_valid_or_report
|
||||
__list_del_entry_valid_or_report
|
||||
mod_timer
|
||||
__num_online_cpus
|
||||
of_find_node_by_name
|
||||
put_disk
|
||||
queue_work_on
|
||||
_raw_spin_lock_irqsave
|
||||
_raw_spin_unlock_irqrestore
|
||||
__register_blkdev
|
||||
sched_setattr_nocheck
|
||||
set_capacity
|
||||
set_disk_ro
|
||||
sg_init_table
|
||||
sg_nents
|
||||
__sw_hweight64
|
||||
timer_delete
|
||||
unregister_blkdev
|
||||
vfree
|
||||
vzalloc
|
||||
wait_for_completion
|
||||
wait_for_completion_interruptible
|
||||
wake_up_process
|
@@ -883,6 +883,7 @@
|
||||
drm_mode_duplicate
|
||||
drm_mode_equal
|
||||
drm_mode_equal_no_clocks
|
||||
drm_mode_is_420_only
|
||||
drm_mode_object_find
|
||||
drm_mode_object_get
|
||||
drm_mode_object_put
|
||||
@@ -2620,6 +2621,7 @@
|
||||
touch_softlockup_watchdog
|
||||
trace_array_destroy
|
||||
trace_array_get_by_name
|
||||
trace_array_get_by_name_ext
|
||||
trace_array_put
|
||||
trace_array_set_clr_event
|
||||
trace_event_buffer_commit
|
||||
@@ -2731,6 +2733,7 @@
|
||||
__traceiter_android_vh_ufs_update_sysfs
|
||||
__traceiter_android_vh_usb_dev_resume
|
||||
__traceiter_android_vh_use_amu_fie
|
||||
__traceiter_android_vh_xhci_full_reset_on_remove
|
||||
__traceiter_clock_set_rate
|
||||
__traceiter_cma_alloc_finish
|
||||
__traceiter_cma_alloc_start
|
||||
@@ -2869,6 +2872,7 @@
|
||||
__tracepoint_android_vh_ufs_update_sysfs
|
||||
__tracepoint_android_vh_usb_dev_resume
|
||||
__tracepoint_android_vh_use_amu_fie
|
||||
__tracepoint_android_vh_xhci_full_reset_on_remove
|
||||
__tracepoint_clock_set_rate
|
||||
__tracepoint_cma_alloc_finish
|
||||
__tracepoint_cma_alloc_start
|
||||
|
@@ -154,6 +154,8 @@
|
||||
__traceiter_android_vh_look_around_migrate_folio
|
||||
__traceiter_android_vh_lruvec_add_folio
|
||||
__traceiter_android_vh_lruvec_del_folio
|
||||
__traceiter_android_vh_mempool_alloc_skip_wait
|
||||
__traceiter_android_vh_mm_free_page
|
||||
__traceiter_android_vh_mmap_region
|
||||
__traceiter_android_vh_mutex_init
|
||||
__traceiter_android_vh_mutex_unlock_slowpath
|
||||
@@ -284,6 +286,8 @@
|
||||
__tracepoint_android_vh_look_around_migrate_folio
|
||||
__tracepoint_android_vh_lruvec_add_folio
|
||||
__tracepoint_android_vh_lruvec_del_folio
|
||||
__tracepoint_android_vh_mempool_alloc_skip_wait
|
||||
__tracepoint_android_vh_mm_free_page
|
||||
__tracepoint_android_vh_mmap_region
|
||||
__tracepoint_android_vh_mutex_init
|
||||
__tracepoint_android_vh_mutex_unlock_slowpath
|
||||
|
@@ -23,6 +23,8 @@
|
||||
__tracepoint_android_vh_tune_swappiness
|
||||
__traceiter_android_vh_do_shrink_slab_ex
|
||||
__tracepoint_android_vh_do_shrink_slab_ex
|
||||
__traceiter_android_vh_migration_target_bypass
|
||||
__tracepoint_android_vh_migration_target_bypass
|
||||
|
||||
# required by lz4 decompress module
|
||||
__tracepoint_android_vh_lz4_decompress_bypass
|
||||
|
@@ -1911,6 +1911,7 @@
|
||||
scsi_report_bus_reset
|
||||
scsi_scan_host
|
||||
scsi_unblock_requests
|
||||
scsi_host_busy
|
||||
sdev_prefix_printk
|
||||
security_file_ioctl
|
||||
select_fallback_rq
|
@@ -737,6 +737,7 @@ CONFIG_CRYPTO_LZ4=y
|
||||
CONFIG_CRYPTO_ZSTD=y
|
||||
CONFIG_CRYPTO_ANSI_CPRNG=y
|
||||
CONFIG_CRYPTO_GHASH_ARM64_CE=y
|
||||
CONFIG_CRYPTO_SHA1_ARM64_CE=y
|
||||
CONFIG_CRYPTO_SHA2_ARM64_CE=y
|
||||
CONFIG_CRYPTO_SHA512_ARM64_CE=y
|
||||
CONFIG_CRYPTO_POLYVAL_ARM64_CE=y
|
||||
|
@@ -8,6 +8,8 @@ CONFIG_RCU_EXPERT=y
|
||||
CONFIG_IKCONFIG=y
|
||||
CONFIG_IKCONFIG_PROC=y
|
||||
CONFIG_LOG_BUF_SHIFT=14
|
||||
CONFIG_CGROUPS=y
|
||||
CONFIG_MEMCG=y
|
||||
# CONFIG_RD_GZIP is not set
|
||||
# CONFIG_RD_BZIP2 is not set
|
||||
# CONFIG_RD_LZMA is not set
|
||||
@@ -136,8 +138,10 @@ CONFIG_STATIC_USERMODEHELPER_PATH=""
|
||||
CONFIG_SECURITY_SELINUX=y
|
||||
CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
|
||||
CONFIG_BUG_ON_DATA_CORRUPTION=y
|
||||
CONFIG_CRYPTO_SHA1=y
|
||||
CONFIG_CRYPTO_HCTR2=y
|
||||
CONFIG_CRYPTO_LZO=y
|
||||
CONFIG_CRYPTO_SHA1_ARM64_CE=y
|
||||
CONFIG_CRYPTO_SHA2_ARM64_CE=y
|
||||
CONFIG_CRYPTO_POLYVAL_ARM64_CE=y
|
||||
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
|
||||
|
@@ -83,6 +83,7 @@ enum __kvm_host_smccc_func {
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_relax_perms,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_wrprotect,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_dirty_log,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_host_split_guest,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_tlb_flush_vmid,
|
||||
__KVM_HOST_SMCCC_FUNC___kvm_adjust_pc,
|
||||
__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
|
||||
|
@@ -224,20 +224,36 @@ struct kvm_smccc_features {
|
||||
};
|
||||
|
||||
struct kvm_pinned_page {
|
||||
union {
|
||||
struct rb_node node;
|
||||
struct list_head list_node;
|
||||
};
|
||||
struct page *page;
|
||||
u64 ipa;
|
||||
u64 __subtree_last;
|
||||
u8 order;
|
||||
u16 pins;
|
||||
};
|
||||
|
||||
#define KVM_DUMMY_PPAGE ((struct kvm_pinned_page *)-1)
|
||||
struct kvm_pinned_page
|
||||
*kvm_pinned_pages_iter_first(struct rb_root_cached *root, u64 start, u64 end);
|
||||
struct kvm_pinned_page
|
||||
*kvm_pinned_pages_iter_next(struct kvm_pinned_page *ppage, u64 start, u64 end);
|
||||
|
||||
#define for_ppage_node_in_range(kvm, start, end, __ppage, __tmp) \
|
||||
for (__ppage = kvm_pinned_pages_iter_first(&(kvm)->arch.pkvm.pinned_pages, start, end - 1);\
|
||||
__ppage && ({ __tmp = kvm_pinned_pages_iter_next(__ppage, start, end - 1); 1; }); \
|
||||
__ppage = __tmp)
|
||||
|
||||
void kvm_pinned_pages_remove(struct kvm_pinned_page *ppage,
|
||||
struct rb_root_cached *root);
|
||||
|
||||
typedef unsigned int pkvm_handle_t;
|
||||
|
||||
struct kvm_protected_vm {
|
||||
pkvm_handle_t handle;
|
||||
struct kvm_hyp_memcache stage2_teardown_mc;
|
||||
struct maple_tree pinned_pages;
|
||||
_ANDROID_KABI_REPLACE(struct maple_tree __unused, struct rb_root_cached pinned_pages);
|
||||
gpa_t pvmfw_load_addr;
|
||||
bool enabled;
|
||||
};
|
||||
@@ -525,6 +541,7 @@ struct kvm_hyp_req {
|
||||
#define KVM_HYP_LAST_REQ 0
|
||||
#define KVM_HYP_REQ_TYPE_MEM 1
|
||||
#define KVM_HYP_REQ_TYPE_MAP 2
|
||||
#define KVM_HYP_REQ_TYPE_SPLIT 3
|
||||
u8 type;
|
||||
union {
|
||||
struct {
|
||||
@@ -539,6 +556,12 @@ struct kvm_hyp_req {
|
||||
unsigned long guest_ipa;
|
||||
size_t size;
|
||||
} map;
|
||||
#ifndef __GENKSYMS__
|
||||
struct {
|
||||
unsigned long guest_ipa;
|
||||
size_t size;
|
||||
} split;
|
||||
#endif
|
||||
};
|
||||
};
|
||||
|
||||
|
@@ -184,6 +184,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
|
||||
|
||||
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
|
||||
int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t size);
|
||||
int __pkvm_pgtable_stage2_split(struct kvm_vcpu *vcpu, phys_addr_t ipa, size_t size);
|
||||
|
||||
phys_addr_t kvm_mmu_get_httbr(void);
|
||||
phys_addr_t kvm_get_idmap_vector(void);
|
||||
|
@@ -862,8 +862,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
|
||||
* kvm_pgtable_stage2_split() is best effort: it tries to break as many
|
||||
* blocks in the input range as allowed by @mc_capacity.
|
||||
*/
|
||||
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
|
||||
struct kvm_mmu_memory_cache *mc);
|
||||
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc);
|
||||
|
||||
/**
|
||||
* kvm_pgtable_walk() - Walk a page-table.
|
||||
|
@@ -363,6 +363,11 @@ static int handle_hyp_req_map(struct kvm_vcpu *vcpu,
|
||||
return pkvm_mem_abort_range(vcpu, req->map.guest_ipa, req->map.size);
|
||||
}
|
||||
|
||||
static int handle_hyp_req_split(struct kvm_vcpu *vcpu, struct kvm_hyp_req *req)
|
||||
{
|
||||
return __pkvm_pgtable_stage2_split(vcpu, req->split.guest_ipa, req->split.size);
|
||||
}
|
||||
|
||||
static int handle_hyp_req(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_hyp_req *hyp_req = vcpu->arch.hyp_reqs;
|
||||
@@ -379,6 +384,9 @@ static int handle_hyp_req(struct kvm_vcpu *vcpu)
|
||||
case KVM_HYP_REQ_TYPE_MAP:
|
||||
ret = handle_hyp_req_map(vcpu, hyp_req);
|
||||
break;
|
||||
case KVM_HYP_REQ_TYPE_SPLIT:
|
||||
ret = handle_hyp_req_split(vcpu, hyp_req);
|
||||
break;
|
||||
default:
|
||||
pr_warn("Unknown kvm_hyp_req type: %d\n", hyp_req->type);
|
||||
ret = -EINVAL;
|
||||
|
@@ -63,6 +63,7 @@ int __pkvm_host_unuse_dma(u64 phys_addr, size_t size);
|
||||
int __pkvm_guest_stage2_snapshot(struct kvm_pgtable_snapshot *snap, struct pkvm_hyp_vm *vm);
|
||||
int __pkvm_host_stage2_snapshot(struct kvm_pgtable_snapshot *snap);
|
||||
int __pkvm_host_lazy_pte(u64 pfn, u64 nr_pages, bool enable);
|
||||
int __pkvm_host_split_guest(u64 pfn, u64 gfn, u64 size, struct pkvm_hyp_vcpu *vcpu);
|
||||
|
||||
bool addr_is_memory(phys_addr_t phys);
|
||||
int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
|
||||
|
@@ -556,7 +556,7 @@ void *hyp_alloc(size_t size)
|
||||
unsigned long chunk_addr;
|
||||
int missing_map, ret = 0;
|
||||
|
||||
size = ALIGN(size, MIN_ALLOC);
|
||||
size = ALIGN(size ?: MIN_ALLOC, MIN_ALLOC);
|
||||
|
||||
hyp_spin_lock(&allocator->lock);
|
||||
|
||||
|
@@ -1073,6 +1073,27 @@ out:
|
||||
cpu_reg(host_ctxt, 1) = ret;
|
||||
}
|
||||
|
||||
static void handle___pkvm_host_split_guest(struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
DECLARE_REG(u64, pfn, host_ctxt, 1);
|
||||
DECLARE_REG(u64, gfn, host_ctxt, 2);
|
||||
DECLARE_REG(u64, size, host_ctxt, 3);
|
||||
struct pkvm_hyp_vcpu *hyp_vcpu;
|
||||
int ret = -EINVAL;
|
||||
|
||||
if (!is_protected_kvm_enabled())
|
||||
goto out;
|
||||
|
||||
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
|
||||
if (!hyp_vcpu)
|
||||
goto out;
|
||||
|
||||
ret = __pkvm_host_split_guest(pfn, gfn, size, hyp_vcpu);
|
||||
|
||||
out:
|
||||
cpu_reg(host_ctxt, 1) = ret;
|
||||
}
|
||||
|
||||
static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
struct pkvm_hyp_vcpu *hyp_vcpu;
|
||||
@@ -1618,6 +1639,7 @@ static const hcall_t host_hcall[] = {
|
||||
HANDLE_FUNC(__pkvm_relax_perms),
|
||||
HANDLE_FUNC(__pkvm_wrprotect),
|
||||
HANDLE_FUNC(__pkvm_dirty_log),
|
||||
HANDLE_FUNC(__pkvm_host_split_guest),
|
||||
HANDLE_FUNC(__pkvm_tlb_flush_vmid),
|
||||
HANDLE_FUNC(__kvm_adjust_pc),
|
||||
HANDLE_FUNC(__kvm_vcpu_run),
|
||||
|
@@ -387,6 +387,10 @@ static int relinquish_walker(const struct kvm_pgtable_visit_ctx *ctx,
|
||||
if (!kvm_pte_valid(pte))
|
||||
return 0;
|
||||
|
||||
/* We don't support splitting non-leaf mappings */
|
||||
if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1))
|
||||
return -E2BIG;
|
||||
|
||||
state = pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
|
||||
if (state != data->expected_state)
|
||||
return -EPERM;
|
||||
@@ -433,8 +437,7 @@ int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu,
|
||||
goto end;
|
||||
|
||||
/* Zap the guest stage2 pte and return ownership to the host */
|
||||
ret = kvm_pgtable_stage2_annotate(&vm->pgt, ipa, PAGE_SIZE,
|
||||
&vcpu->vcpu.arch.stage2_mc, 0);
|
||||
ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE);
|
||||
if (ret)
|
||||
goto end;
|
||||
|
||||
@@ -2760,6 +2763,30 @@ unlock:
|
||||
|
||||
}
|
||||
|
||||
int __pkvm_host_split_guest(u64 pfn, u64 gfn, u64 size, struct pkvm_hyp_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_hyp_memcache *mc = &vcpu->vcpu.arch.stage2_mc;
|
||||
struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
|
||||
u64 ipa = hyp_pfn_to_phys(gfn);
|
||||
int ret;
|
||||
|
||||
if (size != PMD_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
guest_lock_component(vm);
|
||||
|
||||
/*
|
||||
* stage2_split() already checks the existing mapping is valid and PMD-level.
|
||||
* No other check is necessary.
|
||||
*/
|
||||
|
||||
ret = kvm_pgtable_stage2_split(&vm->pgt, ipa, size, mc);
|
||||
|
||||
guest_unlock_component(vm);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __pkvm_host_donate_guest(struct pkvm_hyp_vcpu *vcpu, u64 pfn, u64 gfn,
|
||||
u64 nr_pages)
|
||||
{
|
||||
|
@@ -702,16 +702,13 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
|
||||
if (ret)
|
||||
goto done;
|
||||
|
||||
ret = pkvm_vcpu_init_psci(hyp_vcpu);
|
||||
if (ret)
|
||||
goto done;
|
||||
|
||||
if (test_bit(KVM_ARM_VCPU_SVE, hyp_vcpu->vcpu.arch.features)) {
|
||||
ret = init_pkvm_hyp_vcpu_sve(hyp_vcpu, host_vcpu);
|
||||
if (ret)
|
||||
goto done;
|
||||
}
|
||||
|
||||
WARN_ON(pkvm_vcpu_init_psci(hyp_vcpu));
|
||||
pkvm_vcpu_init_traps(hyp_vcpu);
|
||||
kvm_reset_pvm_sys_regs(&hyp_vcpu->vcpu);
|
||||
done:
|
||||
@@ -1588,9 +1585,19 @@ static bool pkvm_memrelinquish_call(struct pkvm_hyp_vcpu *hyp_vcpu,
|
||||
goto out_guest_err;
|
||||
|
||||
ret = __pkvm_guest_relinquish_to_host(hyp_vcpu, ipa, &pa);
|
||||
if (ret == -ENOMEM) {
|
||||
if (pkvm_handle_empty_memcache(hyp_vcpu, exit_code))
|
||||
if (ret == -E2BIG) {
|
||||
struct kvm_hyp_req *req = pkvm_hyp_req_reserve(hyp_vcpu, KVM_HYP_REQ_TYPE_SPLIT);
|
||||
|
||||
if (!req) {
|
||||
ret = -ENOMEM;
|
||||
goto out_guest_err;
|
||||
}
|
||||
|
||||
req->split.guest_ipa = ALIGN_DOWN(ipa, PMD_SIZE);
|
||||
req->split.size = PMD_SIZE;
|
||||
|
||||
write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR);
|
||||
*exit_code = ARM_EXCEPTION_HYP_REQ;
|
||||
|
||||
return false;
|
||||
} else if (ret) {
|
||||
|
@@ -1769,13 +1769,49 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
|
||||
struct kvm_mmu_memory_cache *mc)
|
||||
static int pkvm_stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
|
||||
enum kvm_pgtable_walk_flags visit)
|
||||
{
|
||||
struct stage2_map_data *data = ctx->arg;
|
||||
struct kvm_pgtable *pgt = data->mmu->pgt;
|
||||
struct kvm_hyp_memcache *mc = data->memcache;
|
||||
enum kvm_pgtable_prot prot;
|
||||
kvm_pte_t pte = ctx->old;
|
||||
kvm_pte_t *childp;
|
||||
|
||||
if (ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)
|
||||
return 0;
|
||||
|
||||
/* We can only split PMD-level blocks */
|
||||
if (!kvm_pte_valid(pte) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 2)
|
||||
return -EINVAL;
|
||||
|
||||
prot = kvm_pgtable_stage2_pte_prot(pte);
|
||||
childp = kvm_pgtable_stage2_create_unlinked(pgt, kvm_pte_to_phys(pte),
|
||||
ctx->level, prot, mc, true);
|
||||
if (IS_ERR(childp))
|
||||
return PTR_ERR(childp);
|
||||
|
||||
WARN_ON(!stage2_try_break_pte(ctx, data->mmu));
|
||||
|
||||
stage2_make_pte(ctx, kvm_init_table_pte(childp, ctx->mm_ops));
|
||||
dsb(ishst);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc)
|
||||
{
|
||||
struct stage2_map_data data = {
|
||||
.mmu = pgt->mmu,
|
||||
.memcache = mc,
|
||||
};
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = stage2_split_walker,
|
||||
.cb = static_branch_unlikely(&kvm_protected_mode_initialized) ?
|
||||
pkvm_stage2_split_walker : stage2_split_walker,
|
||||
.arg = static_branch_unlikely(&kvm_protected_mode_initialized) ?
|
||||
&data : mc,
|
||||
.flags = KVM_PGTABLE_WALK_LEAF,
|
||||
.arg = mc,
|
||||
};
|
||||
|
||||
return kvm_pgtable_walk(pgt, addr, size, &walker);
|
||||
|
@@ -6,11 +6,11 @@
|
||||
|
||||
#include <linux/cma.h>
|
||||
#include <linux/dma-map-ops.h>
|
||||
#include <linux/maple_tree.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/interval_tree_generic.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <trace/events/kvm.h>
|
||||
#include <asm/pgalloc.h>
|
||||
@@ -291,6 +291,20 @@ static void invalidate_icache_guest_page(void *va, size_t size)
|
||||
__invalidate_icache_guest_page(va, size);
|
||||
}
|
||||
|
||||
static u64 __pinned_page_start(struct kvm_pinned_page *ppage)
|
||||
{
|
||||
return ppage->ipa;
|
||||
}
|
||||
|
||||
static u64 __pinned_page_end(struct kvm_pinned_page *ppage)
|
||||
{
|
||||
return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1;
|
||||
}
|
||||
|
||||
INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
|
||||
__pinned_page_start, __pinned_page_end, /* empty */,
|
||||
kvm_pinned_pages);
|
||||
|
||||
static int __pkvm_unmap_guest_call(u64 pfn, u64 gfn, u8 order, void *args)
|
||||
{
|
||||
struct kvm *kvm = args;
|
||||
@@ -312,7 +326,7 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
|
||||
* no update needed from here.
|
||||
*/
|
||||
unpin_user_pages(&ppage->page, 1);
|
||||
mtree_erase(&kvm->arch.pkvm.pinned_pages, ppage->ipa);
|
||||
kvm_pinned_pages_remove(ppage, &kvm->arch.pkvm.pinned_pages);
|
||||
kfree(ppage);
|
||||
|
||||
return 0;
|
||||
@@ -320,17 +334,12 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
|
||||
|
||||
static int pkvm_unmap_range(struct kvm *kvm, u64 start, u64 end)
|
||||
{
|
||||
struct kvm_pinned_page *ppage, *tmp;
|
||||
struct mm_struct *mm = kvm->mm;
|
||||
unsigned long index = start;
|
||||
unsigned long cnt = 0;
|
||||
void *entry;
|
||||
int ret = 0;
|
||||
|
||||
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
|
||||
struct kvm_pinned_page *ppage = entry;
|
||||
|
||||
if (ppage == KVM_DUMMY_PPAGE)
|
||||
continue;
|
||||
for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
|
||||
ret = pkvm_unmap_guest(kvm, ppage);
|
||||
if (ret)
|
||||
break;
|
||||
@@ -418,8 +427,7 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
|
||||
|
||||
static void pkvm_stage2_flush(struct kvm *kvm)
|
||||
{
|
||||
unsigned long index = 0;
|
||||
void *entry;
|
||||
struct kvm_pinned_page *ppage, *tmp;
|
||||
|
||||
/*
|
||||
* Contrary to stage2_apply_range(), we don't need to check
|
||||
@@ -427,11 +435,7 @@ static void pkvm_stage2_flush(struct kvm *kvm)
|
||||
* from a vcpu thread, and the list is only ever freed on VM
|
||||
* destroy (which only occurs when all vcpu are gone).
|
||||
*/
|
||||
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
|
||||
struct kvm_pinned_page *ppage = entry;
|
||||
|
||||
if (ppage == KVM_DUMMY_PPAGE)
|
||||
continue;
|
||||
for_ppage_node_in_range(kvm, 0, ULONG_MAX, ppage, tmp) {
|
||||
__clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE);
|
||||
cond_resched_rwlock_write(&kvm->mmu_lock);
|
||||
}
|
||||
@@ -1014,7 +1018,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
|
||||
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
|
||||
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
|
||||
kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
|
||||
mt_init_flags(&kvm->arch.pkvm.pinned_pages, MT_FLAGS_USE_RCU);
|
||||
mmu->arch = &kvm->arch;
|
||||
|
||||
if (is_protected_kvm_enabled())
|
||||
@@ -1293,18 +1296,13 @@ static int __pkvm_wrprotect_call(u64 pfn, u64 gfn, u8 order, void *args)
|
||||
|
||||
static int pkvm_wp_range(struct kvm *kvm, u64 start, u64 end)
|
||||
{
|
||||
unsigned long index = start;
|
||||
void *entry;
|
||||
struct kvm_pinned_page *ppage, *tmp;
|
||||
|
||||
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
|
||||
struct kvm_pinned_page *ppage = entry;
|
||||
for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
|
||||
int ret;
|
||||
|
||||
if (ppage == KVM_DUMMY_PPAGE)
|
||||
continue;
|
||||
ret = pkvm_call_hyp_nvhe_ppage(ppage, __pkvm_wrprotect_call,
|
||||
kvm, false);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
@@ -1630,28 +1628,9 @@ static int pkvm_host_map_guest(u64 pfn, u64 gfn, u64 nr_pages,
|
||||
return (ret == -EPERM) ? -EAGAIN : ret;
|
||||
}
|
||||
|
||||
static struct kvm_pinned_page *
|
||||
find_ppage_or_above(struct kvm *kvm, phys_addr_t ipa)
|
||||
{
|
||||
unsigned long index = ipa;
|
||||
void *entry;
|
||||
|
||||
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
|
||||
if (entry == KVM_DUMMY_PPAGE)
|
||||
continue;
|
||||
return entry;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct kvm_pinned_page *find_ppage(struct kvm *kvm, u64 ipa)
|
||||
{
|
||||
struct kvm_pinned_page *ppage;
|
||||
unsigned long index = ipa;
|
||||
|
||||
ppage = mt_find(&kvm->arch.pkvm.pinned_pages, &index, ipa + PAGE_SIZE - 1);
|
||||
return ppage == KVM_DUMMY_PPAGE ? NULL : ppage;
|
||||
return kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages, ipa, ipa + PAGE_SIZE - 1);
|
||||
}
|
||||
|
||||
static int __pkvm_relax_perms_call(u64 pfn, u64 gfn, u8 order, void *args)
|
||||
@@ -1707,11 +1686,10 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
|
||||
{
|
||||
unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
|
||||
struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
|
||||
unsigned long index, pmd_offset, page_size, end;
|
||||
unsigned long page_size = PAGE_SIZE;
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct kvm_pinned_page *ppage;
|
||||
struct kvm *kvm = vcpu->kvm;
|
||||
struct maple_tree *mt = &kvm->arch.pkvm.pinned_pages;
|
||||
int ret, nr_pages;
|
||||
struct page *page;
|
||||
u64 pfn;
|
||||
@@ -1760,66 +1738,49 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
|
||||
}
|
||||
|
||||
pfn = page_to_pfn(page);
|
||||
pmd_offset = *fault_ipa & (PMD_SIZE - 1);
|
||||
page_size = transparent_hugepage_adjust(kvm, memslot,
|
||||
hva, &pfn,
|
||||
fault_ipa);
|
||||
page = pfn_to_page(pfn);
|
||||
|
||||
retry:
|
||||
if (size)
|
||||
*size = page_size;
|
||||
read_lock(&kvm->mmu_lock);
|
||||
if (!kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
|
||||
ALIGN_DOWN(*fault_ipa, PMD_SIZE),
|
||||
ALIGN(*fault_ipa + 1, PMD_SIZE) - 1))
|
||||
page_size = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, fault_ipa);
|
||||
|
||||
/*
|
||||
* We take the risk of racing with another vCPU, but sync will be restored by the
|
||||
* host_map_guest HVC
|
||||
*/
|
||||
read_unlock(&kvm->mmu_lock);
|
||||
|
||||
page = pfn_to_page(pfn);
|
||||
|
||||
ret = account_locked_vm(mm, page_size >> PAGE_SHIFT, true);
|
||||
if (ret)
|
||||
goto unpin;
|
||||
|
||||
index = *fault_ipa;
|
||||
end = index + page_size - 1;
|
||||
ppage->page = page;
|
||||
ppage->ipa = *fault_ipa;
|
||||
ppage->order = get_order(page_size);
|
||||
ppage->pins = 1 << ppage->order;
|
||||
|
||||
/*
|
||||
* If we already have a mapping in the middle of the THP, we have no
|
||||
* other choice than enforcing PAGE_SIZE for pkvm_host_map_guest() to
|
||||
* succeed.
|
||||
*/
|
||||
if (page_size > PAGE_SIZE && mt_find(mt, &index, end)) {
|
||||
*fault_ipa += pmd_offset;
|
||||
pfn += pmd_offset >> PAGE_SHIFT;
|
||||
page = pfn_to_page(pfn);
|
||||
account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
|
||||
page_size = PAGE_SIZE;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
/* Reserve space in the mtree */
|
||||
ret = mtree_insert_range(mt, index, end, KVM_DUMMY_PPAGE, GFP_KERNEL);
|
||||
if (ret) {
|
||||
if (ret == -EEXIST)
|
||||
ret = 0;
|
||||
goto dec_account;
|
||||
}
|
||||
|
||||
write_lock(&kvm->mmu_lock);
|
||||
ret = pkvm_host_map_guest(pfn, *fault_ipa >> PAGE_SHIFT,
|
||||
page_size >> PAGE_SHIFT, KVM_PGTABLE_PROT_R);
|
||||
if (ret) {
|
||||
if (WARN_ON(ret == -EAGAIN))
|
||||
if (ret == -EAGAIN)
|
||||
ret = 0;
|
||||
|
||||
goto err_unlock;
|
||||
}
|
||||
WARN_ON(mtree_store_range(mt, index, end, ppage, GFP_ATOMIC));
|
||||
kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
|
||||
if (size)
|
||||
*size = page_size;
|
||||
|
||||
return 0;
|
||||
|
||||
err_unlock:
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
dec_account:
|
||||
account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
|
||||
unpin:
|
||||
unpin_user_pages(&page, 1);
|
||||
@@ -1847,13 +1808,13 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
|
||||
idx = srcu_read_lock(&vcpu->kvm->srcu);
|
||||
|
||||
read_lock(&vcpu->kvm->mmu_lock);
|
||||
ppage = find_ppage_or_above(vcpu->kvm, fault_ipa);
|
||||
ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
|
||||
fault_ipa, ipa_end);
|
||||
|
||||
while (fault_ipa < ipa_end) {
|
||||
if (ppage && ppage != KVM_DUMMY_PPAGE && ppage->ipa == fault_ipa) {
|
||||
if (ppage && ppage->ipa == fault_ipa) {
|
||||
page_size = PAGE_SIZE << ppage->order;
|
||||
ppage = mt_next(&vcpu->kvm->arch.pkvm.pinned_pages,
|
||||
ppage->ipa, ULONG_MAX);
|
||||
ppage = kvm_pinned_pages_iter_next(ppage, fault_ipa, ipa_end);
|
||||
} else {
|
||||
gfn_t gfn = gpa_to_gfn(fault_ipa);
|
||||
struct kvm_memory_slot *memslot;
|
||||
@@ -1877,7 +1838,8 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
|
||||
* We had to release the mmu_lock so let's update the
|
||||
* reference.
|
||||
*/
|
||||
ppage = find_ppage_or_above(vcpu->kvm, fault_ipa + page_size);
|
||||
ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
|
||||
fault_ipa + PAGE_SIZE, ipa_end);
|
||||
}
|
||||
|
||||
fault_ipa += page_size;
|
||||
@@ -1889,6 +1851,162 @@ end:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int __pkvm_pin_user_pages(struct kvm *kvm, struct kvm_memory_slot *memslot,
|
||||
u64 gfn, u64 nr_pages, struct page ***__pages)
|
||||
{
|
||||
unsigned long hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
|
||||
unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct page **pages;
|
||||
long ret;
|
||||
int p;
|
||||
|
||||
pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
|
||||
if (!pages)
|
||||
return -ENOMEM;
|
||||
|
||||
mmap_read_lock(mm);
|
||||
ret = pin_user_pages(hva, nr_pages, flags, pages);
|
||||
mmap_read_unlock(mm);
|
||||
|
||||
if (ret == -EHWPOISON) {
|
||||
kvm_send_hwpoison_signal(hva, PAGE_SHIFT);
|
||||
goto err_free_pages;
|
||||
} else if (ret == -EFAULT) {
|
||||
/* Will try MMIO map */
|
||||
ret = -EREMOTEIO;
|
||||
goto err_free_pages;
|
||||
} else if (ret < 0) {
|
||||
ret = -EFAULT;
|
||||
goto err_free_pages;
|
||||
} else if (ret != nr_pages) {
|
||||
nr_pages = ret;
|
||||
ret = -EFAULT;
|
||||
goto err_unpin_pages;
|
||||
}
|
||||
|
||||
/* See PageSwapBacked() in pkvm_mem_abort() */
|
||||
for (p = 0; p < nr_pages; p++) {
|
||||
if (!folio_test_swapbacked(page_folio(pages[p]))) {
|
||||
ret = -EIO;
|
||||
goto err_unpin_pages;
|
||||
}
|
||||
}
|
||||
|
||||
*__pages = pages;
|
||||
return 0;
|
||||
|
||||
err_unpin_pages:
|
||||
unpin_user_pages(pages, nr_pages);
|
||||
err_free_pages:
|
||||
kfree(pages);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Splitting is only expected on the back of a relinquish guest HVC in the pKVM case, while
|
||||
* pkvm_pgtable_stage2_split() can be called with dirty logging.
|
||||
*/
|
||||
int __pkvm_pgtable_stage2_split(struct kvm_vcpu *vcpu, phys_addr_t ipa, size_t size)
|
||||
{
|
||||
struct list_head ppage_prealloc = LIST_HEAD_INIT(ppage_prealloc);
|
||||
struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
|
||||
struct kvm_pinned_page *ppage, *tmp;
|
||||
struct kvm_memory_slot *memslot;
|
||||
struct kvm *kvm = vcpu->kvm;
|
||||
int idx, p, ret, nr_pages;
|
||||
struct page **pages;
|
||||
kvm_pfn_t pfn;
|
||||
gfn_t gfn;
|
||||
|
||||
if (!IS_ALIGNED(ipa, PMD_SIZE) || size != PMD_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
if (!hyp_memcache->nr_pages) {
|
||||
ret = topup_hyp_memcache(hyp_memcache, 1, 0);
|
||||
if (ret)
|
||||
return -ENOMEM;
|
||||
|
||||
atomic64_add(PAGE_SIZE, &kvm->stat.protected_hyp_mem);
|
||||
atomic64_add(PAGE_SIZE, &kvm->stat.protected_pgtable_mem);
|
||||
}
|
||||
|
||||
/* We already have 1 pin on the Huge Page */
|
||||
nr_pages = (size >> PAGE_SHIFT) - 1;
|
||||
gfn = (ipa >> PAGE_SHIFT) + 1;
|
||||
|
||||
/* Pre-allocate kvm_pinned_page before acquiring the mmu_lock */
|
||||
for (p = 0; p < nr_pages; p++) {
|
||||
ppage = kzalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
|
||||
if (!ppage) {
|
||||
ret = -ENOMEM;
|
||||
goto free_pinned_pages;
|
||||
}
|
||||
list_add(&ppage->list_node, &ppage_prealloc);
|
||||
}
|
||||
|
||||
idx = srcu_read_lock(&vcpu->kvm->srcu);
|
||||
memslot = gfn_to_memslot(vcpu->kvm, gfn);
|
||||
ret = __pkvm_pin_user_pages(kvm, memslot, gfn, nr_pages, &pages);
|
||||
if (ret)
|
||||
goto unlock_srcu;
|
||||
|
||||
write_lock(&kvm->mmu_lock);
|
||||
|
||||
ppage = find_ppage(kvm, ipa);
|
||||
if (!ppage) {
|
||||
ret = -EPERM;
|
||||
goto end;
|
||||
} else if (!ppage->order) {
|
||||
ret = 0;
|
||||
goto end;
|
||||
}
|
||||
|
||||
ret = kvm_call_hyp_nvhe(__pkvm_host_split_guest, page_to_pfn(ppage->page),
|
||||
ipa >> PAGE_SHIFT, size);
|
||||
if (ret)
|
||||
goto end;
|
||||
|
||||
ppage->order = 0;
|
||||
ppage->pins = 1;
|
||||
|
||||
pfn = page_to_pfn(ppage->page) + 1;
|
||||
ipa = ipa + PAGE_SIZE;
|
||||
while (nr_pages--) {
|
||||
/* Pop a ppage from the pre-allocated list */
|
||||
ppage = list_first_entry(&ppage_prealloc, struct kvm_pinned_page, list_node);
|
||||
list_del_init(&ppage->list_node);
|
||||
|
||||
ppage->page = pfn_to_page(pfn);
|
||||
ppage->ipa = ipa;
|
||||
ppage->order = 0;
|
||||
ppage->pins = 1;
|
||||
kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
|
||||
|
||||
pfn += 1;
|
||||
ipa += PAGE_SIZE;
|
||||
}
|
||||
|
||||
end:
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
|
||||
if (ret)
|
||||
unpin_user_pages(pages, nr_pages);
|
||||
kfree(pages);
|
||||
|
||||
unlock_srcu:
|
||||
srcu_read_unlock(&vcpu->kvm->srcu, idx);
|
||||
|
||||
free_pinned_pages:
|
||||
/* Free unused pre-allocated kvm_pinned_page */
|
||||
list_for_each_entry_safe(ppage, tmp, &ppage_prealloc, list_node) {
|
||||
list_del(&ppage->list_node);
|
||||
kfree(ppage);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
struct kvm_memory_slot *memslot, unsigned long hva,
|
||||
unsigned long fault_status)
|
||||
|
@@ -319,21 +319,17 @@ static int __reclaim_dying_guest_page_call(u64 pfn, u64 gfn, u8 order, void *arg
|
||||
|
||||
static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
|
||||
{
|
||||
struct kvm_pinned_page *tmp, *ppage;
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct kvm_pinned_page *ppage;
|
||||
struct kvm_vcpu *host_vcpu;
|
||||
unsigned long idx, ipa = 0;
|
||||
unsigned long idx;
|
||||
|
||||
if (!host_kvm->arch.pkvm.handle)
|
||||
goto out_free;
|
||||
|
||||
WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, host_kvm->arch.pkvm.handle));
|
||||
|
||||
mt_clear_in_rcu(&host_kvm->arch.pkvm.pinned_pages);
|
||||
|
||||
mt_for_each(&host_kvm->arch.pkvm.pinned_pages, ppage, ipa, ULONG_MAX) {
|
||||
if (WARN_ON(ppage == KVM_DUMMY_PPAGE))
|
||||
continue;
|
||||
for_ppage_node_in_range(host_kvm, 0, ULONG_MAX, ppage, tmp) {
|
||||
WARN_ON(pkvm_call_hyp_nvhe_ppage(ppage,
|
||||
__reclaim_dying_guest_page_call,
|
||||
host_kvm, true));
|
||||
@@ -341,9 +337,9 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
|
||||
|
||||
account_locked_vm(mm, 1, false);
|
||||
unpin_user_pages_dirty_lock(&ppage->page, 1, host_kvm->arch.pkvm.enabled);
|
||||
kvm_pinned_pages_remove(ppage, &host_kvm->arch.pkvm.pinned_pages);
|
||||
kfree(ppage);
|
||||
}
|
||||
mtree_destroy(&host_kvm->arch.pkvm.pinned_pages);
|
||||
|
||||
WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm, host_kvm->arch.pkvm.handle));
|
||||
|
||||
@@ -538,21 +534,21 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct kvm_pinned_page *ppage;
|
||||
unsigned long index = ipa;
|
||||
u16 pins;
|
||||
|
||||
write_lock(&host_kvm->mmu_lock);
|
||||
ppage = mt_find(&host_kvm->arch.pkvm.pinned_pages, &index,
|
||||
index + PAGE_SIZE - 1);
|
||||
if (ppage && ppage != KVM_DUMMY_PPAGE) {
|
||||
ppage = kvm_pinned_pages_iter_first(&host_kvm->arch.pkvm.pinned_pages,
|
||||
ipa, ipa + PAGE_SIZE - 1);
|
||||
if (ppage) {
|
||||
WARN_ON_ONCE(ppage->pins != 1);
|
||||
|
||||
if (ppage->pins)
|
||||
ppage->pins--;
|
||||
else
|
||||
WARN_ON(1);
|
||||
|
||||
pins = ppage->pins;
|
||||
if (!pins)
|
||||
mtree_erase(&host_kvm->arch.pkvm.pinned_pages, ipa);
|
||||
kvm_pinned_pages_remove(ppage,
|
||||
&host_kvm->arch.pkvm.pinned_pages);
|
||||
}
|
||||
write_unlock(&host_kvm->mmu_lock);
|
||||
|
||||
|
@@ -672,6 +672,7 @@ CONFIG_CRYPTO_ZSTD=y
|
||||
CONFIG_CRYPTO_ANSI_CPRNG=y
|
||||
CONFIG_CRYPTO_AES_NI_INTEL=y
|
||||
CONFIG_CRYPTO_POLYVAL_CLMUL_NI=y
|
||||
CONFIG_CRYPTO_SHA1_SSSE3=y
|
||||
CONFIG_CRYPTO_SHA256_SSSE3=y
|
||||
CONFIG_CRYPTO_SHA512_SSSE3=y
|
||||
CONFIG_CRC_CCITT=y
|
||||
|
@@ -14,12 +14,6 @@ CONFIG_UCLAMP_TASK=y
|
||||
CONFIG_UCLAMP_BUCKETS_COUNT=20
|
||||
CONFIG_CGROUPS=y
|
||||
CONFIG_MEMCG=y
|
||||
CONFIG_BLK_CGROUP=y
|
||||
CONFIG_CGROUP_SCHED=y
|
||||
CONFIG_UCLAMP_TASK_GROUP=y
|
||||
CONFIG_CGROUP_FREEZER=y
|
||||
CONFIG_CPUSETS=y
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
# CONFIG_RD_BZIP2 is not set
|
||||
# CONFIG_RD_LZMA is not set
|
||||
# CONFIG_RD_XZ is not set
|
||||
@@ -47,7 +41,6 @@ CONFIG_CPU_FREQ_GOV_POWERSAVE=y
|
||||
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
|
||||
CONFIG_JUMP_LABEL=y
|
||||
# CONFIG_BLOCK_LEGACY_AUTOLOAD is not set
|
||||
CONFIG_BLK_CGROUP_IOCOST=y
|
||||
CONFIG_PARTITION_ADVANCED=y
|
||||
# CONFIG_MSDOS_PARTITION is not set
|
||||
# CONFIG_MQ_IOSCHED_DEADLINE is not set
|
||||
@@ -209,6 +202,7 @@ CONFIG_CRYPTO_HCTR2=y
|
||||
CONFIG_CRYPTO_LZO=y
|
||||
CONFIG_CRYPTO_AES_NI_INTEL=y
|
||||
CONFIG_CRYPTO_POLYVAL_CLMUL_NI=y
|
||||
CONFIG_CRYPTO_SHA1_SSSE3=y
|
||||
CONFIG_CRYPTO_SHA256_SSSE3=y
|
||||
CONFIG_CRYPTO_SHA512_SSSE3=y
|
||||
CONFIG_PRINTK_TIME=y
|
||||
|
@@ -6642,10 +6642,10 @@ static void print_binder_transaction_ilocked(struct seq_file *m,
|
||||
}
|
||||
|
||||
static void print_binder_work_ilocked(struct seq_file *m,
|
||||
struct binder_proc *proc,
|
||||
const char *prefix,
|
||||
const char *transaction_prefix,
|
||||
struct binder_work *w)
|
||||
struct binder_proc *proc,
|
||||
const char *prefix,
|
||||
const char *transaction_prefix,
|
||||
struct binder_work *w, bool hash_ptrs)
|
||||
{
|
||||
struct binder_node *node;
|
||||
struct binder_transaction *t;
|
||||
@@ -6668,9 +6668,15 @@ static void print_binder_work_ilocked(struct seq_file *m,
|
||||
break;
|
||||
case BINDER_WORK_NODE:
|
||||
node = container_of(w, struct binder_node, work);
|
||||
seq_printf(m, "%snode work %d: u%016llx c%016llx\n",
|
||||
prefix, node->debug_id,
|
||||
(u64)node->ptr, (u64)node->cookie);
|
||||
if (hash_ptrs)
|
||||
seq_printf(m, "%snode work %d: u%p c%p\n",
|
||||
prefix, node->debug_id,
|
||||
(void *)(long)node->ptr,
|
||||
(void *)(long)node->cookie);
|
||||
else
|
||||
seq_printf(m, "%snode work %d: u%016llx c%016llx\n",
|
||||
prefix, node->debug_id,
|
||||
(u64)node->ptr, (u64)node->cookie);
|
||||
break;
|
||||
case BINDER_WORK_DEAD_BINDER:
|
||||
seq_printf(m, "%shas dead binder\n", prefix);
|
||||
@@ -6695,7 +6701,7 @@ static void print_binder_work_ilocked(struct seq_file *m,
|
||||
|
||||
static void print_binder_thread_ilocked(struct seq_file *m,
|
||||
struct binder_thread *thread,
|
||||
int print_always)
|
||||
bool print_always, bool hash_ptrs)
|
||||
{
|
||||
struct binder_transaction *t;
|
||||
struct binder_work *w;
|
||||
@@ -6725,14 +6731,16 @@ static void print_binder_thread_ilocked(struct seq_file *m,
|
||||
}
|
||||
list_for_each_entry(w, &thread->todo, entry) {
|
||||
print_binder_work_ilocked(m, thread->proc, " ",
|
||||
" pending transaction", w);
|
||||
" pending transaction",
|
||||
w, hash_ptrs);
|
||||
}
|
||||
if (!print_always && m->count == header_pos)
|
||||
m->count = start_pos;
|
||||
}
|
||||
|
||||
static void print_binder_node_nilocked(struct seq_file *m,
|
||||
struct binder_node *node)
|
||||
struct binder_node *node,
|
||||
bool hash_ptrs)
|
||||
{
|
||||
struct binder_ref *ref;
|
||||
struct binder_work *w;
|
||||
@@ -6742,8 +6750,13 @@ static void print_binder_node_nilocked(struct seq_file *m,
|
||||
hlist_for_each_entry(ref, &node->refs, node_entry)
|
||||
count++;
|
||||
|
||||
seq_printf(m, " node %d: u%016llx c%016llx pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d",
|
||||
node->debug_id, (u64)node->ptr, (u64)node->cookie,
|
||||
if (hash_ptrs)
|
||||
seq_printf(m, " node %d: u%p c%p", node->debug_id,
|
||||
(void *)(long)node->ptr, (void *)(long)node->cookie);
|
||||
else
|
||||
seq_printf(m, " node %d: u%016llx c%016llx", node->debug_id,
|
||||
(u64)node->ptr, (u64)node->cookie);
|
||||
seq_printf(m, " pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d",
|
||||
node->sched_policy, node->min_priority,
|
||||
node->has_strong_ref, node->has_weak_ref,
|
||||
node->local_strong_refs, node->local_weak_refs,
|
||||
@@ -6757,7 +6770,8 @@ static void print_binder_node_nilocked(struct seq_file *m,
|
||||
if (node->proc) {
|
||||
list_for_each_entry(w, &node->async_todo, entry)
|
||||
print_binder_work_ilocked(m, node->proc, " ",
|
||||
" pending async transaction", w);
|
||||
" pending async transaction",
|
||||
w, hash_ptrs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6773,8 +6787,54 @@ static void print_binder_ref_olocked(struct seq_file *m,
|
||||
binder_node_unlock(ref->node);
|
||||
}
|
||||
|
||||
static void print_binder_proc(struct seq_file *m,
|
||||
struct binder_proc *proc, int print_all)
|
||||
/**
|
||||
* print_next_binder_node_ilocked() - Print binder_node from a locked list
|
||||
* @m: struct seq_file for output via seq_printf()
|
||||
* @proc: struct binder_proc we hold the inner_proc_lock to (if any)
|
||||
* @node: struct binder_node to print fields of
|
||||
* @prev_node: struct binder_node we hold a temporary reference to (if any)
|
||||
* @hash_ptrs: whether to hash @node's binder_uintptr_t fields
|
||||
*
|
||||
* Helper function to handle synchronization around printing a struct
|
||||
* binder_node while iterating through @proc->nodes or the dead nodes list.
|
||||
* Caller must hold either @proc->inner_lock (for live nodes) or
|
||||
* binder_dead_nodes_lock. This lock will be released during the body of this
|
||||
* function, but it will be reacquired before returning to the caller.
|
||||
*
|
||||
* Return: pointer to the struct binder_node we hold a tmpref on
|
||||
*/
|
||||
static struct binder_node *
|
||||
print_next_binder_node_ilocked(struct seq_file *m, struct binder_proc *proc,
|
||||
struct binder_node *node,
|
||||
struct binder_node *prev_node, bool hash_ptrs)
|
||||
{
|
||||
/*
|
||||
* Take a temporary reference on the node so that isn't freed while
|
||||
* we print it.
|
||||
*/
|
||||
binder_inc_node_tmpref_ilocked(node);
|
||||
/*
|
||||
* Live nodes need to drop the inner proc lock and dead nodes need to
|
||||
* drop the binder_dead_nodes_lock before trying to take the node lock.
|
||||
*/
|
||||
if (proc)
|
||||
binder_inner_proc_unlock(proc);
|
||||
else
|
||||
spin_unlock(&binder_dead_nodes_lock);
|
||||
if (prev_node)
|
||||
binder_put_node(prev_node);
|
||||
binder_node_inner_lock(node);
|
||||
print_binder_node_nilocked(m, node, hash_ptrs);
|
||||
binder_node_inner_unlock(node);
|
||||
if (proc)
|
||||
binder_inner_proc_lock(proc);
|
||||
else
|
||||
spin_lock(&binder_dead_nodes_lock);
|
||||
return node;
|
||||
}
|
||||
|
||||
static void print_binder_proc(struct seq_file *m, struct binder_proc *proc,
|
||||
bool print_all, bool hash_ptrs)
|
||||
{
|
||||
struct binder_work *w;
|
||||
struct rb_node *n;
|
||||
@@ -6787,31 +6847,19 @@ static void print_binder_proc(struct seq_file *m,
|
||||
header_pos = m->count;
|
||||
|
||||
binder_inner_proc_lock(proc);
|
||||
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
|
||||
for (n = rb_first(&proc->threads); n; n = rb_next(n))
|
||||
print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread,
|
||||
rb_node), print_all);
|
||||
rb_node), print_all, hash_ptrs);
|
||||
|
||||
for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) {
|
||||
for (n = rb_first(&proc->nodes); n; n = rb_next(n)) {
|
||||
struct binder_node *node = rb_entry(n, struct binder_node,
|
||||
rb_node);
|
||||
if (!print_all && !node->has_async_transaction)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* take a temporary reference on the node so it
|
||||
* survives and isn't removed from the tree
|
||||
* while we print it.
|
||||
*/
|
||||
binder_inc_node_tmpref_ilocked(node);
|
||||
/* Need to drop inner lock to take node lock */
|
||||
binder_inner_proc_unlock(proc);
|
||||
if (last_node)
|
||||
binder_put_node(last_node);
|
||||
binder_node_inner_lock(node);
|
||||
print_binder_node_nilocked(m, node);
|
||||
binder_node_inner_unlock(node);
|
||||
last_node = node;
|
||||
binder_inner_proc_lock(proc);
|
||||
last_node = print_next_binder_node_ilocked(m, proc, node,
|
||||
last_node,
|
||||
hash_ptrs);
|
||||
}
|
||||
binder_inner_proc_unlock(proc);
|
||||
if (last_node)
|
||||
@@ -6819,19 +6867,18 @@ static void print_binder_proc(struct seq_file *m,
|
||||
|
||||
if (print_all) {
|
||||
binder_proc_lock(proc);
|
||||
for (n = rb_first(&proc->refs_by_desc);
|
||||
n != NULL;
|
||||
n = rb_next(n))
|
||||
for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n))
|
||||
print_binder_ref_olocked(m, rb_entry(n,
|
||||
struct binder_ref,
|
||||
rb_node_desc));
|
||||
struct binder_ref,
|
||||
rb_node_desc));
|
||||
binder_proc_unlock(proc);
|
||||
}
|
||||
binder_alloc_print_allocated(m, &proc->alloc);
|
||||
binder_inner_proc_lock(proc);
|
||||
list_for_each_entry(w, &proc->todo, entry)
|
||||
print_binder_work_ilocked(m, proc, " ",
|
||||
" pending transaction", w);
|
||||
" pending transaction", w,
|
||||
hash_ptrs);
|
||||
list_for_each_entry(w, &proc->delivered_death, entry) {
|
||||
seq_puts(m, " has delivered dead binder\n");
|
||||
break;
|
||||
@@ -6958,7 +7005,7 @@ static void print_binder_proc_stats(struct seq_file *m,
|
||||
count = 0;
|
||||
ready_threads = 0;
|
||||
binder_inner_proc_lock(proc);
|
||||
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
|
||||
for (n = rb_first(&proc->threads); n; n = rb_next(n))
|
||||
count++;
|
||||
|
||||
list_for_each_entry(thread, &proc->waiting_threads, waiting_thread_node)
|
||||
@@ -6972,7 +7019,7 @@ static void print_binder_proc_stats(struct seq_file *m,
|
||||
ready_threads,
|
||||
free_async_space);
|
||||
count = 0;
|
||||
for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n))
|
||||
for (n = rb_first(&proc->nodes); n; n = rb_next(n))
|
||||
count++;
|
||||
binder_inner_proc_unlock(proc);
|
||||
seq_printf(m, " nodes: %d\n", count);
|
||||
@@ -6980,7 +7027,7 @@ static void print_binder_proc_stats(struct seq_file *m,
|
||||
strong = 0;
|
||||
weak = 0;
|
||||
binder_proc_lock(proc);
|
||||
for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) {
|
||||
for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n)) {
|
||||
struct binder_ref *ref = rb_entry(n, struct binder_ref,
|
||||
rb_node_desc);
|
||||
count++;
|
||||
@@ -7007,7 +7054,7 @@ static void print_binder_proc_stats(struct seq_file *m,
|
||||
print_binder_stats(m, " ", &proc->stats);
|
||||
}
|
||||
|
||||
static int state_show(struct seq_file *m, void *unused)
|
||||
static void print_binder_state(struct seq_file *m, bool hash_ptrs)
|
||||
{
|
||||
struct binder_proc *proc;
|
||||
struct binder_node *node;
|
||||
@@ -7018,31 +7065,40 @@ static int state_show(struct seq_file *m, void *unused)
|
||||
spin_lock(&binder_dead_nodes_lock);
|
||||
if (!hlist_empty(&binder_dead_nodes))
|
||||
seq_puts(m, "dead nodes:\n");
|
||||
hlist_for_each_entry(node, &binder_dead_nodes, dead_node) {
|
||||
/*
|
||||
* take a temporary reference on the node so it
|
||||
* survives and isn't removed from the list
|
||||
* while we print it.
|
||||
*/
|
||||
node->tmp_refs++;
|
||||
spin_unlock(&binder_dead_nodes_lock);
|
||||
if (last_node)
|
||||
binder_put_node(last_node);
|
||||
binder_node_lock(node);
|
||||
print_binder_node_nilocked(m, node);
|
||||
binder_node_unlock(node);
|
||||
last_node = node;
|
||||
spin_lock(&binder_dead_nodes_lock);
|
||||
}
|
||||
hlist_for_each_entry(node, &binder_dead_nodes, dead_node)
|
||||
last_node = print_next_binder_node_ilocked(m, NULL, node,
|
||||
last_node,
|
||||
hash_ptrs);
|
||||
spin_unlock(&binder_dead_nodes_lock);
|
||||
if (last_node)
|
||||
binder_put_node(last_node);
|
||||
|
||||
mutex_lock(&binder_procs_lock);
|
||||
hlist_for_each_entry(proc, &binder_procs, proc_node)
|
||||
print_binder_proc(m, proc, 1);
|
||||
print_binder_proc(m, proc, true, hash_ptrs);
|
||||
mutex_unlock(&binder_procs_lock);
|
||||
}
|
||||
|
||||
static void print_binder_transactions(struct seq_file *m, bool hash_ptrs)
|
||||
{
|
||||
struct binder_proc *proc;
|
||||
|
||||
seq_puts(m, "binder transactions:\n");
|
||||
mutex_lock(&binder_procs_lock);
|
||||
hlist_for_each_entry(proc, &binder_procs, proc_node)
|
||||
print_binder_proc(m, proc, false, hash_ptrs);
|
||||
mutex_unlock(&binder_procs_lock);
|
||||
}
|
||||
|
||||
static int state_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
print_binder_state(m, false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int state_hashed_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
print_binder_state(m, true);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -7064,14 +7120,13 @@ static int stats_show(struct seq_file *m, void *unused)
|
||||
|
||||
static int transactions_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
struct binder_proc *proc;
|
||||
|
||||
seq_puts(m, "binder transactions:\n");
|
||||
mutex_lock(&binder_procs_lock);
|
||||
hlist_for_each_entry(proc, &binder_procs, proc_node)
|
||||
print_binder_proc(m, proc, 0);
|
||||
mutex_unlock(&binder_procs_lock);
|
||||
print_binder_transactions(m, false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int transactions_hashed_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
print_binder_transactions(m, true);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -7084,7 +7139,7 @@ static int proc_show(struct seq_file *m, void *unused)
|
||||
hlist_for_each_entry(itr, &binder_procs, proc_node) {
|
||||
if (itr->pid == pid) {
|
||||
seq_puts(m, "binder proc state:\n");
|
||||
print_binder_proc(m, itr, 1);
|
||||
print_binder_proc(m, itr, true, false);
|
||||
}
|
||||
}
|
||||
mutex_unlock(&binder_procs_lock);
|
||||
@@ -7151,8 +7206,10 @@ const struct file_operations binder_fops = {
|
||||
};
|
||||
|
||||
DEFINE_SHOW_ATTRIBUTE(state);
|
||||
DEFINE_SHOW_ATTRIBUTE(state_hashed);
|
||||
DEFINE_SHOW_ATTRIBUTE(stats);
|
||||
DEFINE_SHOW_ATTRIBUTE(transactions);
|
||||
DEFINE_SHOW_ATTRIBUTE(transactions_hashed);
|
||||
DEFINE_SHOW_ATTRIBUTE(transaction_log);
|
||||
|
||||
const struct binder_debugfs_entry binder_debugfs_entries[] = {
|
||||
@@ -7162,6 +7219,12 @@ const struct binder_debugfs_entry binder_debugfs_entries[] = {
|
||||
.fops = &state_fops,
|
||||
.data = NULL,
|
||||
},
|
||||
{
|
||||
.name = "state_hashed",
|
||||
.mode = 0444,
|
||||
.fops = &state_hashed_fops,
|
||||
.data = NULL,
|
||||
},
|
||||
{
|
||||
.name = "stats",
|
||||
.mode = 0444,
|
||||
@@ -7174,6 +7237,12 @@ const struct binder_debugfs_entry binder_debugfs_entries[] = {
|
||||
.fops = &transactions_fops,
|
||||
.data = NULL,
|
||||
},
|
||||
{
|
||||
.name = "transactions_hashed",
|
||||
.mode = 0444,
|
||||
.fops = &transactions_hashed_fops,
|
||||
.data = NULL,
|
||||
},
|
||||
{
|
||||
.name = "transaction_log",
|
||||
.mode = 0444,
|
||||
|
@@ -490,6 +490,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_lruvec_add_folio);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_lruvec_del_folio);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_lazyfree_bypass);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_async_mmap_readahead);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mm_free_page);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_page_mapcount);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_page_to_lrulist);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_del_page_from_lrulist);
|
||||
@@ -676,3 +677,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_filemap_fault_pre_folio_locked);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_filemap_folio_mapped);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_folio_remove_rmap_ptes);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pageset_update);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_xhci_full_reset_on_remove);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mempool_alloc_skip_wait);
|
||||
|
@@ -1002,7 +1002,7 @@ static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer)
|
||||
* If 'expires' is after the current time, we've been called
|
||||
* too early.
|
||||
*/
|
||||
if (expires > 0 && expires < ktime_get_mono_fast_ns()) {
|
||||
if (expires > 0 && expires <= ktime_get_mono_fast_ns()) {
|
||||
dev->power.timer_expires = 0;
|
||||
rpm_suspend(dev, dev->power.timer_autosuspends ?
|
||||
(RPM_ASYNC | RPM_AUTO) : RPM_ASYNC);
|
||||
|
@@ -284,15 +284,13 @@ static int kvm_arm_smmu_domain_finalize(struct kvm_arm_smmu_domain *kvm_smmu_dom
|
||||
return 0;
|
||||
}
|
||||
|
||||
kvm_smmu_domain->smmu = smmu;
|
||||
|
||||
if (kvm_smmu_domain->domain.type == IOMMU_DOMAIN_IDENTITY) {
|
||||
kvm_smmu_domain->id = KVM_IOMMU_DOMAIN_IDMAP_ID;
|
||||
/*
|
||||
* Identity domains doesn't use the DMA API, so no need to
|
||||
* set the domain aperture.
|
||||
*/
|
||||
return 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Default to stage-1. */
|
||||
@@ -325,7 +323,13 @@ static int kvm_arm_smmu_domain_finalize(struct kvm_arm_smmu_domain *kvm_smmu_dom
|
||||
|
||||
ret = kvm_call_hyp_nvhe_mc(__pkvm_host_iommu_alloc_domain,
|
||||
kvm_smmu_domain->id, kvm_smmu_domain->type);
|
||||
if (ret) {
|
||||
ida_free(&kvm_arm_smmu_domain_ida, kvm_smmu_domain->id);
|
||||
return ret;
|
||||
}
|
||||
|
||||
out:
|
||||
kvm_smmu_domain->smmu = smmu;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@@ -629,7 +629,6 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd)
|
||||
int tag = scsi_cmd_to_rq(cmd)->tag;
|
||||
struct ufshcd_lrb *lrbp = &hba->lrb[tag];
|
||||
struct ufs_hw_queue *hwq;
|
||||
unsigned long flags;
|
||||
int err;
|
||||
|
||||
/* Skip task abort in case previous aborts failed and report failure */
|
||||
@@ -668,10 +667,5 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd)
|
||||
return FAILED;
|
||||
}
|
||||
|
||||
spin_lock_irqsave(&hwq->cq_lock, flags);
|
||||
if (ufshcd_cmd_inflight(lrbp->cmd))
|
||||
ufshcd_release_scsi_cmd(hba, lrbp);
|
||||
spin_unlock_irqrestore(&hwq->cq_lock, flags);
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
@@ -6545,9 +6545,14 @@ static void ufshcd_err_handler(struct work_struct *work)
|
||||
up(&hba->host_sem);
|
||||
return;
|
||||
}
|
||||
spin_unlock_irqrestore(hba->host->host_lock, flags);
|
||||
|
||||
ufshcd_err_handling_prepare(hba);
|
||||
|
||||
spin_lock_irqsave(hba->host->host_lock, flags);
|
||||
ufshcd_set_eh_in_progress(hba);
|
||||
spin_unlock_irqrestore(hba->host->host_lock, flags);
|
||||
ufshcd_err_handling_prepare(hba);
|
||||
|
||||
/* Complete requests that have door-bell cleared by h/w */
|
||||
ufshcd_complete_requests(hba, false);
|
||||
spin_lock_irqsave(hba->host->host_lock, flags);
|
||||
|
@@ -18,6 +18,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/dmi.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <trace/hooks/usb.h>
|
||||
|
||||
#include "xhci.h"
|
||||
#include "xhci-trace.h"
|
||||
@@ -196,6 +197,7 @@ int xhci_reset(struct xhci_hcd *xhci, u64 timeout_us)
|
||||
u32 command;
|
||||
u32 state;
|
||||
int ret;
|
||||
bool full_reset = 0;
|
||||
|
||||
state = readl(&xhci->op_regs->status);
|
||||
|
||||
@@ -224,8 +226,11 @@ int xhci_reset(struct xhci_hcd *xhci, u64 timeout_us)
|
||||
if (xhci->quirks & XHCI_INTEL_HOST)
|
||||
udelay(1000);
|
||||
|
||||
trace_android_vh_xhci_full_reset_on_remove(&full_reset);
|
||||
|
||||
ret = xhci_handshake_check_state(xhci, &xhci->op_regs->command,
|
||||
CMD_RESET, 0, timeout_us, XHCI_STATE_REMOVING);
|
||||
CMD_RESET, 0, timeout_us,
|
||||
full_reset ? 0 : XHCI_STATE_REMOVING);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
|
@@ -547,6 +547,14 @@ struct pd_rx_event {
|
||||
struct pd_message msg;
|
||||
};
|
||||
|
||||
struct altmode_vdm_event {
|
||||
struct kthread_work work;
|
||||
struct tcpm_port *port;
|
||||
u32 header;
|
||||
u32 *data;
|
||||
int cnt;
|
||||
};
|
||||
|
||||
static const char * const pd_rev[] = {
|
||||
[PD_REV10] = "rev1",
|
||||
[PD_REV20] = "rev2",
|
||||
@@ -1531,14 +1539,66 @@ static void tcpm_queue_vdm(struct tcpm_port *port, const u32 header,
|
||||
mod_vdm_delayed_work(port, 0);
|
||||
}
|
||||
|
||||
static void tcpm_queue_vdm_unlocked(struct tcpm_port *port, const u32 header,
|
||||
const u32 *data, int cnt)
|
||||
static void tcpm_queue_vdm_work(struct kthread_work *work)
|
||||
{
|
||||
struct altmode_vdm_event *event = container_of(work,
|
||||
struct altmode_vdm_event,
|
||||
work);
|
||||
struct tcpm_port *port = event->port;
|
||||
|
||||
mutex_lock(&port->lock);
|
||||
tcpm_queue_vdm(port, header, data, cnt);
|
||||
if (port->state != SRC_READY && port->state != SNK_READY) {
|
||||
tcpm_log_force(port, "dropping altmode_vdm_event");
|
||||
goto port_unlock;
|
||||
}
|
||||
|
||||
tcpm_queue_vdm(port, event->header, event->data, event->cnt);
|
||||
|
||||
port_unlock:
|
||||
kfree(event->data);
|
||||
kfree(event);
|
||||
mutex_unlock(&port->lock);
|
||||
}
|
||||
|
||||
static int tcpm_queue_vdm_unlocked(struct tcpm_port *port, const u32 header,
|
||||
const u32 *data, int cnt)
|
||||
{
|
||||
struct altmode_vdm_event *event;
|
||||
u32 *data_cpy;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
event = kzalloc(sizeof(*event), GFP_KERNEL);
|
||||
if (!event)
|
||||
goto err_event;
|
||||
|
||||
data_cpy = kcalloc(cnt, sizeof(u32), GFP_KERNEL);
|
||||
if (!data_cpy)
|
||||
goto err_data;
|
||||
|
||||
kthread_init_work(&event->work, tcpm_queue_vdm_work);
|
||||
event->port = port;
|
||||
event->header = header;
|
||||
memcpy(data_cpy, data, sizeof(u32) * cnt);
|
||||
event->data = data_cpy;
|
||||
event->cnt = cnt;
|
||||
|
||||
ret = kthread_queue_work(port->wq, &event->work);
|
||||
if (!ret) {
|
||||
ret = -EBUSY;
|
||||
goto err_queue;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
err_queue:
|
||||
kfree(data_cpy);
|
||||
err_data:
|
||||
kfree(event);
|
||||
err_event:
|
||||
tcpm_log_force(port, "failed to queue altmode vdm, err:%d", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void svdm_consume_identity(struct tcpm_port *port, const u32 *p, int cnt)
|
||||
{
|
||||
u32 vdo = p[VDO_INDEX_IDH];
|
||||
@@ -2297,8 +2357,7 @@ static int tcpm_altmode_enter(struct typec_altmode *altmode, u32 *vdo)
|
||||
header = VDO(altmode->svid, vdo ? 2 : 1, svdm_version, CMD_ENTER_MODE);
|
||||
header |= VDO_OPOS(altmode->mode);
|
||||
|
||||
tcpm_queue_vdm_unlocked(port, header, vdo, vdo ? 1 : 0);
|
||||
return 0;
|
||||
return tcpm_queue_vdm_unlocked(port, header, vdo, vdo ? 1 : 0);
|
||||
}
|
||||
|
||||
static int tcpm_altmode_exit(struct typec_altmode *altmode)
|
||||
@@ -2314,8 +2373,7 @@ static int tcpm_altmode_exit(struct typec_altmode *altmode)
|
||||
header = VDO(altmode->svid, 1, svdm_version, CMD_EXIT_MODE);
|
||||
header |= VDO_OPOS(altmode->mode);
|
||||
|
||||
tcpm_queue_vdm_unlocked(port, header, NULL, 0);
|
||||
return 0;
|
||||
return tcpm_queue_vdm_unlocked(port, header, NULL, 0);
|
||||
}
|
||||
|
||||
static int tcpm_altmode_vdm(struct typec_altmode *altmode,
|
||||
@@ -2323,9 +2381,7 @@ static int tcpm_altmode_vdm(struct typec_altmode *altmode,
|
||||
{
|
||||
struct tcpm_port *port = typec_altmode_get_drvdata(altmode);
|
||||
|
||||
tcpm_queue_vdm_unlocked(port, header, data, count - 1);
|
||||
|
||||
return 0;
|
||||
return tcpm_queue_vdm_unlocked(port, header, data, count - 1);
|
||||
}
|
||||
|
||||
static const struct typec_altmode_ops tcpm_altmode_ops = {
|
||||
|
@@ -336,6 +336,7 @@ static struct workqueue_struct *z_erofs_workqueue __read_mostly;
|
||||
|
||||
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
|
||||
static struct kthread_worker __rcu **z_erofs_pcpu_workers;
|
||||
static atomic_t erofs_percpu_workers_initialized = ATOMIC_INIT(0);
|
||||
|
||||
static void erofs_destroy_percpu_workers(void)
|
||||
{
|
||||
@@ -381,12 +382,8 @@ static int erofs_init_percpu_workers(void)
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static inline void erofs_destroy_percpu_workers(void) {}
|
||||
static inline int erofs_init_percpu_workers(void) { return 0; }
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
|
||||
static enum cpuhp_state erofs_cpuhp_state;
|
||||
|
||||
@@ -443,15 +440,53 @@ static void erofs_cpu_hotplug_destroy(void)
|
||||
if (erofs_cpuhp_state)
|
||||
cpuhp_remove_state_nocalls(erofs_cpuhp_state);
|
||||
}
|
||||
#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
|
||||
#else /* !CONFIG_HOTPLUG_CPU */
|
||||
static inline int erofs_cpu_hotplug_init(void) { return 0; }
|
||||
static inline void erofs_cpu_hotplug_destroy(void) {}
|
||||
#endif
|
||||
#endif/* CONFIG_HOTPLUG_CPU */
|
||||
static int z_erofs_init_pcpu_workers(struct super_block *sb)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (atomic_xchg(&erofs_percpu_workers_initialized, 1))
|
||||
return 0;
|
||||
|
||||
err = erofs_init_percpu_workers();
|
||||
if (err) {
|
||||
erofs_err(sb, "per-cpu workers: failed to allocate.");
|
||||
goto err_init_percpu_workers;
|
||||
}
|
||||
|
||||
err = erofs_cpu_hotplug_init();
|
||||
if (err < 0) {
|
||||
erofs_err(sb, "per-cpu workers: failed CPU hotplug init.");
|
||||
goto err_cpuhp_init;
|
||||
}
|
||||
erofs_info(sb, "initialized per-cpu workers successfully.");
|
||||
return err;
|
||||
|
||||
err_cpuhp_init:
|
||||
erofs_destroy_percpu_workers();
|
||||
err_init_percpu_workers:
|
||||
atomic_set(&erofs_percpu_workers_initialized, 0);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void z_erofs_destroy_pcpu_workers(void)
|
||||
{
|
||||
if (!atomic_xchg(&erofs_percpu_workers_initialized, 0))
|
||||
return;
|
||||
erofs_cpu_hotplug_destroy();
|
||||
erofs_destroy_percpu_workers();
|
||||
}
|
||||
#else /* !CONFIG_EROFS_FS_PCPU_KTHREAD */
|
||||
static inline int z_erofs_init_pcpu_workers(struct super_block *sb) { return 0; }
|
||||
static inline void z_erofs_destroy_pcpu_workers(void) {}
|
||||
#endif/* CONFIG_EROFS_FS_PCPU_KTHREAD */
|
||||
|
||||
void z_erofs_exit_zip_subsystem(void)
|
||||
{
|
||||
erofs_cpu_hotplug_destroy();
|
||||
erofs_destroy_percpu_workers();
|
||||
z_erofs_destroy_pcpu_workers();
|
||||
destroy_workqueue(z_erofs_workqueue);
|
||||
z_erofs_destroy_pcluster_pool();
|
||||
}
|
||||
@@ -467,23 +502,12 @@ int __init z_erofs_init_zip_subsystem(void)
|
||||
WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
|
||||
if (!z_erofs_workqueue) {
|
||||
err = -ENOMEM;
|
||||
goto out_error_workqueue_init;
|
||||
goto out_err_workqueue_init;
|
||||
}
|
||||
|
||||
err = erofs_init_percpu_workers();
|
||||
if (err)
|
||||
goto out_error_pcpu_worker;
|
||||
|
||||
err = erofs_cpu_hotplug_init();
|
||||
if (err < 0)
|
||||
goto out_error_cpuhp_init;
|
||||
return err;
|
||||
|
||||
out_error_cpuhp_init:
|
||||
erofs_destroy_percpu_workers();
|
||||
out_error_pcpu_worker:
|
||||
destroy_workqueue(z_erofs_workqueue);
|
||||
out_error_workqueue_init:
|
||||
out_err_workqueue_init:
|
||||
z_erofs_destroy_pcluster_pool();
|
||||
out_error_pcluster_pool:
|
||||
return err;
|
||||
@@ -711,8 +735,14 @@ static const struct address_space_operations z_erofs_cache_aops = {
|
||||
|
||||
int erofs_init_managed_cache(struct super_block *sb)
|
||||
{
|
||||
struct inode *const inode = new_inode(sb);
|
||||
struct inode *inode;
|
||||
int err;
|
||||
|
||||
err = z_erofs_init_pcpu_workers(sb);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
inode = new_inode(sb);
|
||||
if (!inode)
|
||||
return -ENOMEM;
|
||||
|
||||
|
@@ -799,6 +799,10 @@ int fuse_file_read_iter_initialize(
|
||||
.size = to->count,
|
||||
};
|
||||
|
||||
fri->frio = (struct fuse_read_iter_out) {
|
||||
.ret = fri->fri.size,
|
||||
};
|
||||
|
||||
/* TODO we can't assume 'to' is a kvec */
|
||||
/* TODO we also can't assume the vector has only one component */
|
||||
*fa = (struct fuse_bpf_args) {
|
||||
@@ -833,6 +837,11 @@ int fuse_file_read_iter_backing(struct fuse_bpf_args *fa,
|
||||
if (!iov_iter_count(to))
|
||||
return 0;
|
||||
|
||||
if ((iocb->ki_flags & IOCB_DIRECT) &&
|
||||
(!ff->backing_file->f_mapping->a_ops ||
|
||||
!ff->backing_file->f_mapping->a_ops->direct_IO))
|
||||
return -EINVAL;
|
||||
|
||||
/* TODO This just plain ignores any change to fuse_read_in */
|
||||
if (is_sync_kiocb(iocb)) {
|
||||
ret = vfs_iter_read(ff->backing_file, to, &iocb->ki_pos,
|
||||
@@ -855,13 +864,14 @@ int fuse_file_read_iter_backing(struct fuse_bpf_args *fa,
|
||||
fuse_bpf_aio_cleanup_handler(aio_req);
|
||||
}
|
||||
|
||||
frio->ret = ret;
|
||||
|
||||
/* TODO Need to point value at the buffer for post-modification */
|
||||
|
||||
out:
|
||||
fuse_file_accessed(file, ff->backing_file);
|
||||
|
||||
frio->ret = ret;
|
||||
return ret < 0 ? ret : 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void *fuse_file_read_iter_finalize(struct fuse_bpf_args *fa,
|
||||
|
@@ -41,6 +41,24 @@ struct poll_table_struct;
|
||||
|
||||
/* define the enumeration of all cgroup subsystems */
|
||||
#define SUBSYS(_x) _x ## _cgrp_id,
|
||||
|
||||
#define CSS_COUNTERS_SIZE (CGROUP_SUBSYS_COUNT * sizeof(atomic_t))
|
||||
|
||||
/*
|
||||
* This should just use max(), but max() doesn't work in struct definitions.
|
||||
*
|
||||
* Originally, the space was reserved for per cgroup subsystem counters, where each counter was
|
||||
* the size of an atomic_t variable. However, it was later reused to fit a struct rcu_head
|
||||
* which is why the calculation considers the size of struct rcu_head.
|
||||
*
|
||||
* This macro is provided to ANDROID_BACKPORT_USE_ARRAY() which needs to reserve at least
|
||||
* enough memory to accommodate struct rcu_head. However, if we only reserve CSS_COUNTERS_SIZE,
|
||||
* that may not be enough space on kernels with a small amount of cgroup subsystems enabled. So,
|
||||
* we take the max between the two values to use in ANDROID_BACKPORT_USE_ARRAY().
|
||||
*/
|
||||
#define CGROUP_ROOT_BACKPORT_PADDING_SIZE \
|
||||
(CSS_COUNTERS_SIZE > sizeof(struct rcu_head) ? CSS_COUNTERS_SIZE : sizeof(struct rcu_head))
|
||||
|
||||
enum cgroup_subsys_id {
|
||||
#include <linux/cgroup_subsys.h>
|
||||
CGROUP_SUBSYS_COUNT,
|
||||
@@ -585,8 +603,12 @@ struct cgroup_root {
|
||||
/* The name for this hierarchy - may be empty */
|
||||
char name[MAX_CGROUP_ROOT_NAMELEN];
|
||||
|
||||
ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_SUBSYS_COUNT * sizeof(atomic_t),
|
||||
struct rcu_head rcu);
|
||||
/* Use the original calculation to preserve the CRC value for the ABI. */
|
||||
#ifndef __GENKSYMS__
|
||||
ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_ROOT_BACKPORT_PADDING_SIZE, struct rcu_head rcu);
|
||||
#else
|
||||
ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_SUBSYS_COUNT * sizeof(atomic_t), struct rcu_head rcu);
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
|
@@ -277,15 +277,25 @@ struct mthp_stat {
|
||||
#ifdef CONFIG_SYSFS
|
||||
DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
|
||||
|
||||
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
|
||||
static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
|
||||
{
|
||||
if (order <= 0 || order > PMD_ORDER)
|
||||
return;
|
||||
|
||||
this_cpu_inc(mthp_stats.stats[order][item]);
|
||||
this_cpu_add(mthp_stats.stats[order][item], delta);
|
||||
}
|
||||
|
||||
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
|
||||
{
|
||||
mod_mthp_stat(order, item, 1);
|
||||
}
|
||||
|
||||
unsigned long sum_mthp_stat(int order, enum mthp_stat_item item);
|
||||
#else
|
||||
static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
|
||||
{
|
||||
}
|
||||
@@ -326,7 +336,7 @@ static inline int split_huge_page(struct page *page)
|
||||
{
|
||||
return split_huge_page_to_list(page, NULL);
|
||||
}
|
||||
void deferred_split_folio(struct folio *folio);
|
||||
void deferred_split_folio(struct folio *folio, bool partially_mapped);
|
||||
|
||||
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
unsigned long address, bool freeze, struct folio *folio);
|
||||
@@ -486,7 +496,7 @@ static inline int split_huge_page(struct page *page)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline void deferred_split_folio(struct folio *folio) {}
|
||||
static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
|
||||
#define split_huge_pmd(__vma, __pmd, __address) \
|
||||
do { } while (0)
|
||||
|
||||
|
@@ -4,6 +4,7 @@
|
||||
|
||||
#include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
|
||||
|
||||
extern unsigned int khugepaged_max_ptes_none __read_mostly;
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
extern struct attribute_group khugepaged_attr_group;
|
||||
|
||||
|
@@ -731,8 +731,15 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
|
||||
__mem_cgroup_uncharge_list(page_list);
|
||||
}
|
||||
|
||||
void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
|
||||
void __mem_cgroup_uncharge_folios(struct folio_batch *folios);
|
||||
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
|
||||
{
|
||||
if (mem_cgroup_disabled())
|
||||
return;
|
||||
__mem_cgroup_uncharge_folios(folios);
|
||||
}
|
||||
|
||||
void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
|
||||
void mem_cgroup_migrate(struct folio *old, struct folio *new);
|
||||
|
||||
/**
|
||||
@@ -1171,6 +1178,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
|
||||
gfp_t gfp_mask,
|
||||
unsigned long *total_scanned);
|
||||
|
||||
extern int mem_cgroup_init(void);
|
||||
#else /* CONFIG_MEMCG */
|
||||
|
||||
#define MEM_CGROUP_ID_SHIFT 0
|
||||
@@ -1297,6 +1305,10 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_replace_folio(struct folio *old,
|
||||
struct folio *new)
|
||||
{
|
||||
@@ -1619,6 +1631,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int mem_cgroup_init(void) { return 0; }
|
||||
#endif /* CONFIG_MEMCG */
|
||||
|
||||
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
|
||||
@@ -1682,18 +1696,18 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
|
||||
return folio_lruvec_lock_irq(folio);
|
||||
}
|
||||
|
||||
/* Don't lock again iff page's lruvec locked */
|
||||
static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio,
|
||||
struct lruvec *locked_lruvec, unsigned long *flags)
|
||||
/* Don't lock again iff folio's lruvec locked */
|
||||
static inline void folio_lruvec_relock_irqsave(struct folio *folio,
|
||||
struct lruvec **lruvecp, unsigned long *flags)
|
||||
{
|
||||
if (locked_lruvec) {
|
||||
if (folio_matches_lruvec(folio, locked_lruvec))
|
||||
return locked_lruvec;
|
||||
if (*lruvecp) {
|
||||
if (folio_matches_lruvec(folio, *lruvecp))
|
||||
return;
|
||||
|
||||
unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
|
||||
unlock_page_lruvec_irqrestore(*lruvecp, *flags);
|
||||
}
|
||||
|
||||
return folio_lruvec_lock_irqsave(folio, flags);
|
||||
*lruvecp = folio_lruvec_lock_irqsave(folio, flags);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
|
@@ -39,6 +39,7 @@ struct anon_vma;
|
||||
struct anon_vma_chain;
|
||||
struct user_struct;
|
||||
struct pt_regs;
|
||||
struct folio_batch;
|
||||
|
||||
extern int sysctl_page_lock_unfairness;
|
||||
|
||||
@@ -1539,6 +1540,8 @@ static inline void folio_put_refs(struct folio *folio, int refs)
|
||||
__folio_put(folio);
|
||||
}
|
||||
|
||||
void folios_put_refs(struct folio_batch *folios, unsigned int *refs);
|
||||
|
||||
/*
|
||||
* union release_pages_arg - an array of pages or folios
|
||||
*
|
||||
@@ -1561,18 +1564,19 @@ void release_pages(release_pages_arg, int nr);
|
||||
/**
|
||||
* folios_put - Decrement the reference count on an array of folios.
|
||||
* @folios: The folios.
|
||||
* @nr: How many folios there are.
|
||||
*
|
||||
* Like folio_put(), but for an array of folios. This is more efficient
|
||||
* than writing the loop yourself as it will optimise the locks which
|
||||
* need to be taken if the folios are freed.
|
||||
* Like folio_put(), but for a batch of folios. This is more efficient
|
||||
* than writing the loop yourself as it will optimise the locks which need
|
||||
* to be taken if the folios are freed. The folios batch is returned
|
||||
* empty and ready to be reused for another batch; there is no need to
|
||||
* reinitialise it.
|
||||
*
|
||||
* Context: May be called in process or interrupt context, but not in NMI
|
||||
* context. May be called while holding a spinlock.
|
||||
*/
|
||||
static inline void folios_put(struct folio **folios, unsigned int nr)
|
||||
static inline void folios_put(struct folio_batch *folios)
|
||||
{
|
||||
release_pages(folios, nr);
|
||||
folios_put_refs(folios, NULL);
|
||||
}
|
||||
|
||||
static inline void put_page(struct page *page)
|
||||
|
@@ -37,6 +37,22 @@
|
||||
|
||||
#define NR_PAGE_ORDERS (MAX_ORDER + 1)
|
||||
|
||||
/* Defines the order for the number of pages that have a migrate type. */
|
||||
#ifndef CONFIG_PAGE_BLOCK_ORDER
|
||||
#define PAGE_BLOCK_ORDER MAX_ORDER
|
||||
#else
|
||||
#define PAGE_BLOCK_ORDER CONFIG_PAGE_BLOCK_ORDER
|
||||
#endif /* CONFIG_PAGE_BLOCK_ORDER */
|
||||
|
||||
/*
|
||||
* The MAX_ORDER, which defines the max order of pages to be allocated
|
||||
* by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_ORDER,
|
||||
* which defines the order for the number of pages that can have a migrate type
|
||||
*/
|
||||
#if (PAGE_BLOCK_ORDER > MAX_ORDER)
|
||||
#error MAX_ORDER must be >= PAGE_BLOCK_ORDER
|
||||
#endif
|
||||
|
||||
/*
|
||||
* PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
|
||||
* costly to service. That is between allocation orders which should
|
||||
|
@@ -197,6 +197,7 @@ enum pageflags {
|
||||
/* At least one page in this folio has the hwpoison flag set */
|
||||
PG_has_hwpoisoned = PG_error,
|
||||
PG_large_rmappable = PG_workingset, /* anon or file-backed */
|
||||
PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
|
||||
};
|
||||
|
||||
#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
|
||||
@@ -372,54 +373,77 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n)
|
||||
#define FOLIO_PF_NO_COMPOUND 0
|
||||
#define FOLIO_PF_SECOND 1
|
||||
|
||||
#define FOLIO_HEAD_PAGE 0
|
||||
#define FOLIO_SECOND_PAGE 1
|
||||
|
||||
/*
|
||||
* Macros to create function definitions for page flags
|
||||
*/
|
||||
#define FOLIO_TEST_FLAG(name, page) \
|
||||
static __always_inline bool folio_test_##name(struct folio *folio) \
|
||||
{ return test_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define FOLIO_SET_FLAG(name, page) \
|
||||
static __always_inline void folio_set_##name(struct folio *folio) \
|
||||
{ set_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define FOLIO_CLEAR_FLAG(name, page) \
|
||||
static __always_inline void folio_clear_##name(struct folio *folio) \
|
||||
{ clear_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define __FOLIO_SET_FLAG(name, page) \
|
||||
static __always_inline void __folio_set_##name(struct folio *folio) \
|
||||
{ __set_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define __FOLIO_CLEAR_FLAG(name, page) \
|
||||
static __always_inline void __folio_clear_##name(struct folio *folio) \
|
||||
{ __clear_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define FOLIO_TEST_SET_FLAG(name, page) \
|
||||
static __always_inline bool folio_test_set_##name(struct folio *folio) \
|
||||
{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define FOLIO_TEST_CLEAR_FLAG(name, page) \
|
||||
static __always_inline bool folio_test_clear_##name(struct folio *folio) \
|
||||
{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define FOLIO_FLAG(name, page) \
|
||||
FOLIO_TEST_FLAG(name, page) \
|
||||
FOLIO_SET_FLAG(name, page) \
|
||||
FOLIO_CLEAR_FLAG(name, page)
|
||||
|
||||
#define TESTPAGEFLAG(uname, lname, policy) \
|
||||
static __always_inline bool folio_test_##lname(struct folio *folio) \
|
||||
{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
FOLIO_TEST_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline int Page##uname(struct page *page) \
|
||||
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
|
||||
|
||||
#define SETPAGEFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
void folio_set_##lname(struct folio *folio) \
|
||||
{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
FOLIO_SET_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline void SetPage##uname(struct page *page) \
|
||||
{ set_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
#define CLEARPAGEFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
void folio_clear_##lname(struct folio *folio) \
|
||||
{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline void ClearPage##uname(struct page *page) \
|
||||
{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
#define __SETPAGEFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
void __folio_set_##lname(struct folio *folio) \
|
||||
{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
__FOLIO_SET_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline void __SetPage##uname(struct page *page) \
|
||||
{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
#define __CLEARPAGEFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
void __folio_clear_##lname(struct folio *folio) \
|
||||
{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline void __ClearPage##uname(struct page *page) \
|
||||
{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
#define TESTSETFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
bool folio_test_set_##lname(struct folio *folio) \
|
||||
{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline int TestSetPage##uname(struct page *page) \
|
||||
{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
#define TESTCLEARFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
bool folio_test_clear_##lname(struct folio *folio) \
|
||||
{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline int TestClearPage##uname(struct page *page) \
|
||||
{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
@@ -842,8 +866,18 @@ static inline void ClearPageCompound(struct page *page)
|
||||
ClearPageHead(page);
|
||||
}
|
||||
PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND)
|
||||
FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
|
||||
/*
|
||||
* PG_partially_mapped is protected by deferred_split split_queue_lock,
|
||||
* so its safe to use non-atomic set/clear.
|
||||
*/
|
||||
__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
|
||||
__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
|
||||
#else
|
||||
TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable)
|
||||
FOLIO_TEST_FLAG_FALSE(partially_mapped)
|
||||
__FOLIO_SET_FLAG_NOOP(partially_mapped)
|
||||
__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
|
||||
#endif
|
||||
|
||||
#define PG_head_mask ((1UL << PG_head))
|
||||
@@ -1111,7 +1145,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
|
||||
*/
|
||||
#define PAGE_FLAGS_SECOND \
|
||||
(0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \
|
||||
1UL << PG_large_rmappable)
|
||||
1UL << PG_large_rmappable | 1UL << PG_partially_mapped)
|
||||
|
||||
#define PAGE_FLAGS_PRIVATE \
|
||||
(1UL << PG_private | 1UL << PG_private_2)
|
||||
|
@@ -3,10 +3,6 @@
|
||||
#define __LINUX_PAGEISOLATION_H
|
||||
|
||||
#ifdef CONFIG_MEMORY_ISOLATION
|
||||
static inline bool has_isolate_pageblock(struct zone *zone)
|
||||
{
|
||||
return zone->nr_isolate_pageblock;
|
||||
}
|
||||
static inline bool is_migrate_isolate_page(struct page *page)
|
||||
{
|
||||
return get_pageblock_migratetype(page) == MIGRATE_ISOLATE;
|
||||
@@ -16,10 +12,6 @@ static inline bool is_migrate_isolate(int migratetype)
|
||||
return migratetype == MIGRATE_ISOLATE;
|
||||
}
|
||||
#else
|
||||
static inline bool has_isolate_pageblock(struct zone *zone)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
static inline bool is_migrate_isolate_page(struct page *page)
|
||||
{
|
||||
return false;
|
||||
|
@@ -28,7 +28,7 @@ enum pageblock_bits {
|
||||
NR_PAGEBLOCK_BITS
|
||||
};
|
||||
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
#if defined(CONFIG_HUGETLB_PAGE)
|
||||
|
||||
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
|
||||
|
||||
@@ -41,14 +41,18 @@ extern unsigned int pageblock_order;
|
||||
* Huge pages are a constant size, but don't exceed the maximum allocation
|
||||
* granularity.
|
||||
*/
|
||||
#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER)
|
||||
#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, PAGE_BLOCK_ORDER)
|
||||
|
||||
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
|
||||
|
||||
#else /* CONFIG_HUGETLB_PAGE */
|
||||
#elif defined(CONFIG_TRANSPARENT_HUGEPAGE)
|
||||
|
||||
#define pageblock_order min_t(unsigned int, HPAGE_PMD_ORDER, PAGE_BLOCK_ORDER)
|
||||
|
||||
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
|
||||
#define pageblock_order MAX_ORDER
|
||||
#define pageblock_order PAGE_BLOCK_ORDER
|
||||
|
||||
#endif /* CONFIG_HUGETLB_PAGE */
|
||||
|
||||
|
@@ -742,7 +742,12 @@ int folio_mkclean(struct folio *);
|
||||
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
|
||||
struct vm_area_struct *vma);
|
||||
|
||||
void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
|
||||
enum rmp_flags {
|
||||
RMP_LOCKED = 1 << 0,
|
||||
RMP_USE_SHARED_ZEROPAGE = 1 << 1,
|
||||
};
|
||||
|
||||
void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);
|
||||
|
||||
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
|
||||
|
||||
|
@@ -52,6 +52,8 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip,
|
||||
int trace_array_init_printk(struct trace_array *tr);
|
||||
void trace_array_put(struct trace_array *tr);
|
||||
struct trace_array *trace_array_get_by_name(const char *name);
|
||||
struct trace_array *trace_array_get_by_name_ext(const char *name,
|
||||
const char *systems);
|
||||
int trace_array_destroy(struct trace_array *tr);
|
||||
|
||||
/* For osnoise tracer */
|
||||
@@ -88,6 +90,11 @@ static inline struct trace_array *trace_array_get_by_name(const char *name)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
static inline struct trace_array *trace_array_get_by_name_ext(
|
||||
const char *name, const char *systems)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
static inline int trace_array_destroy(struct trace_array *tr)
|
||||
{
|
||||
return 0;
|
||||
|
@@ -8,21 +8,46 @@
|
||||
#include <linux/refcount.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
void unix_inflight(struct user_struct *user, struct file *fp);
|
||||
void unix_notinflight(struct user_struct *user, struct file *fp);
|
||||
void unix_destruct_scm(struct sk_buff *skb);
|
||||
void io_uring_destruct_scm(struct sk_buff *skb);
|
||||
void unix_gc(void);
|
||||
void wait_for_unix_gc(void);
|
||||
#if IS_ENABLED(CONFIG_UNIX)
|
||||
struct unix_sock *unix_get_socket(struct file *filp);
|
||||
#else
|
||||
static inline struct unix_sock *unix_get_socket(struct file *filp)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
extern unsigned int unix_tot_inflight;
|
||||
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
|
||||
void unix_del_edges(struct scm_fp_list *fpl);
|
||||
void unix_update_edges(struct unix_sock *receiver);
|
||||
int unix_prepare_fpl(struct scm_fp_list *fpl);
|
||||
void unix_destroy_fpl(struct scm_fp_list *fpl);
|
||||
void unix_gc(void);
|
||||
void wait_for_unix_gc(struct scm_fp_list *fpl);
|
||||
|
||||
struct unix_vertex {
|
||||
struct list_head edges;
|
||||
struct list_head entry;
|
||||
struct list_head scc_entry;
|
||||
unsigned long out_degree;
|
||||
unsigned long index;
|
||||
unsigned long scc_index;
|
||||
};
|
||||
|
||||
struct unix_edge {
|
||||
struct unix_sock *predecessor;
|
||||
struct unix_sock *successor;
|
||||
struct list_head vertex_entry;
|
||||
struct list_head stack_entry;
|
||||
};
|
||||
|
||||
struct sock *unix_peer_get(struct sock *sk);
|
||||
|
||||
#define UNIX_HASH_MOD (256 - 1)
|
||||
#define UNIX_HASH_SIZE (256 * 2)
|
||||
#define UNIX_HASH_BITS 8
|
||||
|
||||
extern unsigned int unix_tot_inflight;
|
||||
|
||||
struct unix_address {
|
||||
refcount_t refcnt;
|
||||
int len;
|
||||
@@ -42,6 +67,7 @@ struct unix_skb_parms {
|
||||
|
||||
struct scm_stat {
|
||||
atomic_t nr_fds;
|
||||
unsigned long nr_unix_fds;
|
||||
};
|
||||
|
||||
#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb))
|
||||
@@ -54,12 +80,9 @@ struct unix_sock {
|
||||
struct path path;
|
||||
struct mutex iolock, bindlock;
|
||||
struct sock *peer;
|
||||
struct list_head link;
|
||||
unsigned long inflight;
|
||||
struct unix_vertex *vertex;
|
||||
struct sock *listener;
|
||||
spinlock_t lock;
|
||||
unsigned long gc_flags;
|
||||
#define UNIX_GC_CANDIDATE 0
|
||||
#define UNIX_GC_MAYBE_CYCLE 1
|
||||
struct socket_wq peer_wq;
|
||||
wait_queue_entry_t peer_wake;
|
||||
struct scm_stat scm_stat;
|
||||
|
@@ -22,11 +22,24 @@ struct scm_creds {
|
||||
kgid_t gid;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_UNIX
|
||||
struct unix_edge;
|
||||
#endif
|
||||
|
||||
struct scm_fp_list {
|
||||
short count;
|
||||
short max;
|
||||
struct user_struct *user;
|
||||
struct file *fp[SCM_MAX_FD];
|
||||
#ifndef __GENKSYMS__
|
||||
#ifdef CONFIG_UNIX
|
||||
bool inflight;
|
||||
bool dead;
|
||||
struct list_head vertices;
|
||||
struct unix_edge *edges;
|
||||
#endif
|
||||
short count_unix;
|
||||
#endif
|
||||
};
|
||||
|
||||
struct scm_cookie {
|
||||
|
@@ -431,6 +431,9 @@ DECLARE_HOOK(android_vh_add_lazyfree_bypass,
|
||||
DECLARE_HOOK(android_vh_do_async_mmap_readahead,
|
||||
TP_PROTO(struct vm_fault *vmf, struct folio *folio, bool *skip),
|
||||
TP_ARGS(vmf, folio, skip));
|
||||
DECLARE_HOOK(android_vh_mm_free_page,
|
||||
TP_PROTO(struct page *page),
|
||||
TP_ARGS(page));
|
||||
|
||||
DECLARE_HOOK(android_vh_cma_debug_show_areas,
|
||||
TP_PROTO(bool *show),
|
||||
@@ -596,6 +599,9 @@ DECLARE_HOOK(android_vh_folio_remove_rmap_ptes,
|
||||
DECLARE_HOOK(android_vh_pageset_update,
|
||||
TP_PROTO(unsigned long *high, unsigned long *batch),
|
||||
TP_ARGS(high, batch));
|
||||
DECLARE_HOOK(android_vh_mempool_alloc_skip_wait,
|
||||
TP_PROTO(gfp_t *gfp_flags, bool *skip_wait),
|
||||
TP_ARGS(gfp_flags, skip_wait));
|
||||
#endif /* _TRACE_HOOK_MM_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
@@ -31,6 +31,10 @@ DECLARE_HOOK(android_vh_usb_new_device_added,
|
||||
TP_PROTO(struct usb_device *udev, int *err),
|
||||
TP_ARGS(udev, err));
|
||||
|
||||
DECLARE_HOOK(android_vh_xhci_full_reset_on_remove,
|
||||
TP_PROTO(bool *full_reset),
|
||||
TP_ARGS(full_reset));
|
||||
|
||||
#endif /* _TRACE_HOOK_USB_H */
|
||||
/* This part must be outside protection */
|
||||
#include <trace/define_trace.h>
|
||||
|
@@ -50,6 +50,7 @@
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/efi.h>
|
||||
#include <linux/tick.h>
|
||||
@@ -1062,6 +1063,7 @@ void start_kernel(void)
|
||||
proc_root_init();
|
||||
nsfs_init();
|
||||
cpuset_init();
|
||||
mem_cgroup_init();
|
||||
cgroup_init();
|
||||
taskstats_init_early();
|
||||
delayacct_init();
|
||||
|
@@ -452,7 +452,7 @@ struct kmem_cache *files_cachep;
|
||||
struct kmem_cache *fs_cachep;
|
||||
|
||||
/* SLAB cache for vm_area_struct structures */
|
||||
static struct kmem_cache *vm_area_cachep;
|
||||
struct kmem_cache *vm_area_cachep;
|
||||
|
||||
/* SLAB cache for mm_struct structures (tsk->mm) */
|
||||
static struct kmem_cache *mm_cachep;
|
||||
|
@@ -227,6 +227,14 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
|
||||
|
||||
void irq_startup_managed(struct irq_desc *desc)
|
||||
{
|
||||
struct irq_data *d = irq_desc_get_irq_data(desc);
|
||||
|
||||
/*
|
||||
* Clear managed-shutdown flag, so we don't repeat managed-startup for
|
||||
* multiple hotplugs, and cause imbalanced disable depth.
|
||||
*/
|
||||
irqd_clr_managed_shutdown(d);
|
||||
|
||||
/*
|
||||
* Only start it up when the disable depth is 1, so that a disable,
|
||||
* hotunplug, hotplug sequence does not end up enabling it during
|
||||
|
@@ -211,13 +211,6 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
|
||||
!irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Don't restore suspended interrupts here when a system comes back
|
||||
* from S3. They are reenabled via resume_device_irqs().
|
||||
*/
|
||||
if (desc->istate & IRQS_SUSPENDED)
|
||||
return;
|
||||
|
||||
if (irqd_is_managed_and_shutdown(data))
|
||||
irq_startup_managed(desc);
|
||||
|
||||
|
@@ -9538,16 +9538,19 @@ static int trace_array_create_dir(struct trace_array *tr)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct trace_array *trace_array_create(const char *name)
|
||||
static struct trace_array *
|
||||
trace_array_create_systems(const char *name, const char *systems)
|
||||
{
|
||||
struct trace_array_ext *tr_ext;
|
||||
struct trace_array *tr;
|
||||
int ret;
|
||||
|
||||
ret = -ENOMEM;
|
||||
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
|
||||
if (!tr)
|
||||
tr_ext = kzalloc(sizeof(*tr_ext), GFP_KERNEL);
|
||||
if (!tr_ext)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
tr = &tr_ext->trace_array;
|
||||
tr->name = kstrdup(name, GFP_KERNEL);
|
||||
if (!tr->name)
|
||||
goto out_free_tr;
|
||||
@@ -9558,6 +9561,12 @@ static struct trace_array *trace_array_create(const char *name)
|
||||
if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
|
||||
goto out_free_tr;
|
||||
|
||||
if (systems) {
|
||||
tr_ext->system_names = kstrdup_const(systems, GFP_KERNEL);
|
||||
if (!tr_ext->system_names)
|
||||
goto out_free_tr;
|
||||
}
|
||||
|
||||
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
|
||||
|
||||
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
|
||||
@@ -9601,12 +9610,18 @@ static struct trace_array *trace_array_create(const char *name)
|
||||
free_trace_buffers(tr);
|
||||
free_cpumask_var(tr->pipe_cpumask);
|
||||
free_cpumask_var(tr->tracing_cpumask);
|
||||
kfree_const(tr_ext->system_names);
|
||||
kfree(tr->name);
|
||||
kfree(tr);
|
||||
kfree(tr_ext);
|
||||
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static struct trace_array *trace_array_create(const char *name)
|
||||
{
|
||||
return trace_array_create_systems(name, NULL);
|
||||
}
|
||||
|
||||
static int instance_mkdir(const char *name)
|
||||
{
|
||||
struct trace_array *tr;
|
||||
@@ -9629,9 +9644,27 @@ out_unlock:
|
||||
return ret;
|
||||
}
|
||||
|
||||
const char *trace_array_get_system_names(struct trace_array *tr)
|
||||
{
|
||||
struct trace_array_ext *tr_ext;
|
||||
|
||||
if (tr == &global_trace)
|
||||
return NULL;
|
||||
|
||||
tr_ext = container_of(tr, struct trace_array_ext, trace_array);
|
||||
return tr_ext->system_names;
|
||||
}
|
||||
|
||||
struct trace_array *trace_array_get_by_name(const char *name)
|
||||
{
|
||||
return trace_array_get_by_name_ext(name, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(trace_array_get_by_name);
|
||||
|
||||
/**
|
||||
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
|
||||
* trace_array_get_by_name_ext - Create/Lookup a trace array, given its name.
|
||||
* @name: The name of the trace array to be looked up/created.
|
||||
* @systems: A list of systems to create event directories for (NULL for all)
|
||||
*
|
||||
* Returns pointer to trace array with given name.
|
||||
* NULL, if it cannot be created.
|
||||
@@ -9645,7 +9678,8 @@ out_unlock:
|
||||
* trace_array_put() is called, user space can not delete it.
|
||||
*
|
||||
*/
|
||||
struct trace_array *trace_array_get_by_name(const char *name)
|
||||
struct trace_array *trace_array_get_by_name_ext(const char *name,
|
||||
const char *systems)
|
||||
{
|
||||
struct trace_array *tr;
|
||||
|
||||
@@ -9657,7 +9691,7 @@ struct trace_array *trace_array_get_by_name(const char *name)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
tr = trace_array_create(name);
|
||||
tr = trace_array_create_systems(name, systems);
|
||||
|
||||
if (IS_ERR(tr))
|
||||
tr = NULL;
|
||||
@@ -9669,11 +9703,14 @@ out_unlock:
|
||||
mutex_unlock(&event_mutex);
|
||||
return tr;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(trace_array_get_by_name);
|
||||
EXPORT_SYMBOL_GPL(trace_array_get_by_name_ext);
|
||||
|
||||
static int __remove_instance(struct trace_array *tr)
|
||||
{
|
||||
int i;
|
||||
struct trace_array_ext *tr_ext = container_of(tr,
|
||||
struct trace_array_ext,
|
||||
trace_array);
|
||||
|
||||
/* Reference counter for a newly created trace array = 1. */
|
||||
if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
|
||||
@@ -9704,8 +9741,9 @@ static int __remove_instance(struct trace_array *tr)
|
||||
|
||||
free_cpumask_var(tr->pipe_cpumask);
|
||||
free_cpumask_var(tr->tracing_cpumask);
|
||||
kfree_const(tr_ext->system_names);
|
||||
kfree(tr->name);
|
||||
kfree(tr);
|
||||
kfree(tr_ext);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@@ -412,6 +412,11 @@ struct trace_array {
struct trace_func_repeats __percpu *last_func_repeats;
};

struct trace_array_ext {
const char *system_names;
struct trace_array trace_array;
};
|
||||
|
||||
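struct trace_array_ext extends trace_array without changing its layout: the new field lives in a wrapper and the embedded trace_array is handed out as before, which keeps the layout of struct trace_array intact. Code holding a trace_array pointer recovers the wrapper with container_of(), as trace_array_get_system_names() and __remove_instance() do above. A small helper capturing the pattern (hypothetical, not part of the patch):

#include <linux/container_of.h>

static inline struct trace_array_ext *to_trace_array_ext(struct trace_array *tr)
{
    /*
     * Only valid for instances allocated as trace_array_ext;
     * global_trace is not wrapped, hence the explicit check in
     * trace_array_get_system_names().
     */
    return container_of(tr, struct trace_array_ext, trace_array);
}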
enum {
|
||||
TRACE_ARRAY_FL_GLOBAL = (1 << 0)
|
||||
};
|
||||
@@ -420,6 +425,7 @@ extern struct list_head ftrace_trace_arrays;
|
||||
|
||||
extern struct mutex trace_types_lock;
|
||||
|
||||
extern const char *trace_array_get_system_names(struct trace_array *tr);
|
||||
extern int trace_array_get(struct trace_array *tr);
|
||||
extern int tracing_check_open_get_tr(struct trace_array *tr);
|
||||
extern struct trace_array *trace_array_find(const char *instance);
|
||||
|
@@ -3041,6 +3041,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
|
||||
up_write(&trace_event_sem);
|
||||
}
|
||||
|
||||
static bool event_in_systems(struct trace_event_call *call,
const char *systems)
{
const char *system;
const char *p;

if (!systems)
return true;

system = call->class->system;
p = strstr(systems, system);
if (!p)
return false;

if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
return false;

p += strlen(system);
return !*p || isspace(*p) || *p == ',';
}
|
||||
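The rule implemented by event_in_systems() is that the event's system name must appear in the systems list as a whole token, delimited by the start or end of the string, whitespace, or a comma. The userspace re-implementation below (illustrative only, not kernel code) makes the rule easy to experiment with:

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool system_listed(const char *system, const char *systems)
{
    const char *p;

    if (!systems)
        return true;    /* NULL means "all systems" */

    p = strstr(systems, system);
    if (!p)
        return false;
    /* the token must start the list or follow whitespace / ',' */
    if (p != systems && !isspace((unsigned char)p[-1]) && p[-1] != ',')
        return false;
    p += strlen(system);
    /* the token must end the list or be followed by whitespace / ',' */
    return !*p || isspace((unsigned char)*p) || *p == ',';
}

int main(void)
{
    printf("%d\n", system_listed("sched", "sched,irq"));     /* 1 */
    printf("%d\n", system_listed("sched", "sched_ext,irq")); /* 0 */
    return 0;
}

As written, only the first strstr() hit is inspected, so a system whose name first appears as a suffix of another entry (for example "irq" against "hardirq,irq") is rejected; callers are expected to pass well-formed lists.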
static struct trace_event_file *
|
||||
trace_create_new_event(struct trace_event_call *call,
|
||||
struct trace_array *tr)
|
||||
@@ -3050,9 +3071,12 @@ trace_create_new_event(struct trace_event_call *call,
|
||||
struct trace_event_file *file;
|
||||
unsigned int first;
|
||||
|
||||
if (!event_in_systems(call, trace_array_get_system_names(tr)))
|
||||
return NULL;
|
||||
|
||||
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
|
||||
if (!file)
|
||||
return NULL;
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
pid_list = rcu_dereference_protected(tr->filtered_pids,
|
||||
lockdep_is_held(&event_mutex));
|
||||
@@ -3117,8 +3141,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
|
||||
struct trace_event_file *file;
|
||||
|
||||
file = trace_create_new_event(call, tr);
|
||||
/*
|
||||
* trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
|
||||
* allocation, or NULL if the event is not part of the tr->system_names.
|
||||
* When the event is not part of the tr->system_names, return zero, not
|
||||
* an error.
|
||||
*/
|
||||
if (!file)
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
|
||||
if (eventdir_initialized)
|
||||
return event_create_dir(tr->event_dir, file);
|
||||
@@ -3157,8 +3190,17 @@ __trace_early_add_new_event(struct trace_event_call *call,
|
||||
int ret;
|
||||
|
||||
file = trace_create_new_event(call, tr);
|
||||
/*
|
||||
* trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
|
||||
* allocation, or NULL if the event is not part of the tr->system_names.
|
||||
* When the event is not part of the tr->system_names, return zero, not
|
||||
* an error.
|
||||
*/
|
||||
if (!file)
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
|
||||
ret = event_define_fields(call);
|
||||
if (ret)
|
||||
|
mm/Kconfig | 34
@@ -994,6 +994,40 @@ config CMA_AREAS
|
||||
|
||||
If unsure, leave the default value "7" in UMA and "19" in NUMA.
|
||||
|
||||
#
|
||||
# Select this config option from the architecture Kconfig, if available, to set
|
||||
# the max page order for physically contiguous allocations.
|
||||
#
|
||||
config ARCH_FORCE_MAX_ORDER
|
||||
int
|
||||
|
||||
#
|
||||
# When ARCH_FORCE_MAX_ORDER is not defined,
|
||||
# the default page block order is MAX_PAGE_ORDER (10) as per
|
||||
# include/linux/mmzone.h.
|
||||
#
|
||||
config PAGE_BLOCK_ORDER
|
||||
int "Page Block Order"
|
||||
range 1 10 if ARCH_FORCE_MAX_ORDER = 0 || ARCH_FORCE_MAX_ORDER = ""
|
||||
default 10 if ARCH_FORCE_MAX_ORDER = 0 || ARCH_FORCE_MAX_ORDER = ""
|
||||
range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
|
||||
default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
|
||||
help
|
||||
The page block order refers to the power of two number of pages that
|
||||
are physically contiguous and can have a migrate type associated to
|
||||
them. The maximum size of the page block order is limited by
|
||||
ARCH_FORCE_MAX_ORDER.
|
||||
|
||||
This config allows overriding the default page block order when the
|
||||
page block order is required to be smaller than ARCH_FORCE_MAX_ORDER
|
||||
or MAX_ORDER.
|
||||
|
||||
Reducing pageblock order can negatively impact THP generation
|
||||
success rate. If your workloads uses THP heavily, please use this
|
||||
option with caution.
|
||||
|
||||
Don't change if unsure.
|
||||
|
||||
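For a rough sense of scale: a pageblock covers PAGE_SIZE << pageblock_order bytes, and with this option pageblock_order follows PAGE_BLOCK_ORDER (subject to the ARCH_FORCE_MAX_ORDER cap described above). The arithmetic below is purely illustrative and not taken from the patch:

#include <stdio.h>

int main(void)
{
    const struct { long page_kib; int order; } cfg[] = {
        {  4, 10 },  /* default order 10 on 4 KiB pages   ->    4 MiB blocks */
        {  4,  7 },  /* PAGE_BLOCK_ORDER=7 on 4 KiB pages  ->  512 KiB blocks */
        { 16,  7 },  /* PAGE_BLOCK_ORDER=7 on 16 KiB pages ->    2 MiB blocks */
    };

    for (unsigned long i = 0; i < sizeof(cfg) / sizeof(cfg[0]); i++)
        printf("PAGE_SIZE=%2ldK order=%2d -> pageblock=%5ld KiB\n",
               cfg[i].page_kib, cfg[i].order,
               cfg[i].page_kib << cfg[i].order);
    return 0;
}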
config MEM_SOFT_DIRTY
|
||||
bool "Track memory changes"
|
||||
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
|
||||
|
mm/huge_memory.c | 150
@@ -70,6 +70,7 @@ unsigned long transparent_hugepage_flags __read_mostly =
|
||||
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
|
||||
|
||||
static struct shrinker deferred_split_shrinker;
|
||||
static bool split_underused_thp = true;
|
||||
|
||||
static atomic_t huge_zero_refcount;
|
||||
struct page *huge_zero_page __read_mostly;
|
||||
@@ -423,6 +424,27 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj,
|
||||
static struct kobj_attribute hpage_pmd_size_attr =
|
||||
__ATTR_RO(hpage_pmd_size);
|
||||
|
||||
static ssize_t split_underused_thp_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
return sysfs_emit(buf, "%d\n", split_underused_thp);
|
||||
}
|
||||
|
||||
static ssize_t split_underused_thp_store(struct kobject *kobj,
|
||||
struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
int err = kstrtobool(buf, &split_underused_thp);
|
||||
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static struct kobj_attribute split_underused_thp_attr = __ATTR(
|
||||
shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);
|
||||
|
||||
static struct attribute *hugepage_attr[] = {
|
||||
&enabled_attr.attr,
|
||||
&defrag_attr.attr,
|
||||
@@ -431,6 +453,7 @@ static struct attribute *hugepage_attr[] = {
|
||||
#ifdef CONFIG_SHMEM
|
||||
&shmem_enabled_attr.attr,
|
||||
#endif
|
||||
&split_underused_thp_attr.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
@@ -1046,6 +1069,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
|
||||
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
|
||||
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
|
||||
mm_inc_nr_ptes(vma->vm_mm);
|
||||
deferred_split_folio(folio, false);
|
||||
spin_unlock(vmf->ptl);
|
||||
count_vm_event(THP_FAULT_ALLOC);
|
||||
count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
|
||||
@@ -2953,7 +2977,7 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
|
||||
return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
|
||||
}
|
||||
|
||||
static void remap_page(struct folio *folio, unsigned long nr)
|
||||
static void remap_page(struct folio *folio, unsigned long nr, int flags)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
@@ -2961,7 +2985,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
|
||||
if (!folio_test_anon(folio))
|
||||
return;
|
||||
for (;;) {
|
||||
remove_migration_ptes(folio, folio, true);
|
||||
remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
|
||||
i += folio_nr_pages(folio);
|
||||
if (i >= nr)
|
||||
break;
|
||||
@@ -3314,7 +3338,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
|
||||
|
||||
if (nr_dropped)
|
||||
shmem_uncharge(head->mapping->host, nr_dropped);
|
||||
remap_page(folio, nr);
|
||||
remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
|
||||
|
||||
for (i = 0; i < nr; i++) {
|
||||
struct page *subpage = folio_dst_page(folio, i);
|
||||
@@ -3376,8 +3400,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
struct folio *folio = page_folio(page);
|
||||
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
|
||||
XA_STATE(xas, &folio->mapping->i_pages, folio->index);
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
bool is_anon = folio_test_anon(folio);
|
||||
struct address_space *mapping = NULL;
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
int extra_pins, ret;
|
||||
pgoff_t end;
|
||||
bool is_hzp;
|
||||
@@ -3394,7 +3419,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
if (folio_test_writeback(folio))
|
||||
return -EBUSY;
|
||||
|
||||
if (folio_test_anon(folio)) {
|
||||
if (is_anon) {
|
||||
/*
|
||||
* The caller does not necessarily hold an mmap_lock that would
|
||||
* prevent the anon_vma disappearing so we first we take a
|
||||
@@ -3495,6 +3520,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
if (folio_order(folio) > 1 &&
|
||||
!list_empty(&folio->_deferred_list)) {
|
||||
ds_queue->split_queue_len--;
|
||||
if (folio_test_partially_mapped(folio))
|
||||
__folio_clear_partially_mapped(folio);
|
||||
/*
|
||||
* Reinitialize page_deferred_list after removing the
|
||||
* page from the split_queue, otherwise a subsequent
|
||||
* split will see list corruption when checking the
|
||||
* page_deferred_list.
|
||||
*/
|
||||
list_del_init(&folio->_deferred_list);
|
||||
}
|
||||
spin_unlock(&ds_queue->split_queue_lock);
|
||||
@@ -3522,7 +3555,7 @@ unfreeze:
|
||||
folio_ref_unfreeze(folio, 1 + extra_pins);
|
||||
remap:
|
||||
free_dst_pages(folio);
|
||||
remap_page(folio, folio_nr_pages(folio));
|
||||
remap_page(folio, folio_nr_pages(folio), 0);
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
@@ -3572,6 +3605,8 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
|
||||
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
|
||||
if (!list_empty(&folio->_deferred_list)) {
|
||||
ds_queue->split_queue_len--;
|
||||
if (folio_test_partially_mapped(folio))
|
||||
__folio_clear_partially_mapped(folio);
|
||||
list_del_init(&folio->_deferred_list);
|
||||
unqueued = true;
|
||||
}
|
||||
@@ -3580,7 +3615,8 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
|
||||
return unqueued; /* useful for debug warnings */
|
||||
}
|
||||
|
||||
void deferred_split_folio(struct folio *folio)
|
||||
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
|
||||
void deferred_split_folio(struct folio *folio, bool partially_mapped)
|
||||
{
|
||||
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
|
||||
#ifdef CONFIG_MEMCG
|
||||
@@ -3595,6 +3631,9 @@ void deferred_split_folio(struct folio *folio)
|
||||
if (folio_order(folio) <= 1)
|
||||
return;
|
||||
|
||||
if (!partially_mapped && !split_underused_thp)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Exclude swapcache: originally to avoid a corrupt deferred split
|
||||
* queue. Nowadays that is fully prevented by mem_cgroup_swapout();
|
||||
@@ -3605,13 +3644,20 @@ void deferred_split_folio(struct folio *folio)
|
||||
if (folio_test_swapcache(folio))
|
||||
return;
|
||||
|
||||
if (!list_empty(&folio->_deferred_list))
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
|
||||
if (partially_mapped) {
|
||||
if (!folio_test_partially_mapped(folio)) {
|
||||
__folio_set_partially_mapped(folio);
|
||||
if (folio_test_pmd_mappable(folio))
|
||||
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
|
||||
count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
|
||||
|
||||
}
|
||||
} else {
|
||||
/* partially mapped folios cannot become non-partially mapped */
|
||||
VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
|
||||
}
|
||||
if (list_empty(&folio->_deferred_list)) {
|
||||
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
|
||||
count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
|
||||
list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
|
||||
ds_queue->split_queue_len++;
|
||||
#ifdef CONFIG_MEMCG
|
||||
@@ -3640,6 +3686,39 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
|
||||
return READ_ONCE(ds_queue->split_queue_len);
|
||||
}
|
||||
|
||||
static bool thp_underused(struct folio *folio)
{
int num_zero_pages = 0, num_filled_pages = 0;
void *kaddr;
int i;

if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
return false;

for (i = 0; i < folio_nr_pages(folio); i++) {
kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
num_zero_pages++;
if (num_zero_pages > khugepaged_max_ptes_none) {
kunmap_local(kaddr);
return true;
}
} else {
/*
* Another path for early exit once the number
* of non-zero filled pages exceeds threshold.
*/
num_filled_pages++;
if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
kunmap_local(kaddr);
return false;
}
}
kunmap_local(kaddr);
}
return false;
}
|
||||
|
||||
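thp_underused() above treats a PMD-sized folio as underused once more than khugepaged_max_ptes_none of its subpages are entirely zero, bailing out early in either direction. The standalone model below mirrors that threshold logic in plain C so it can be exercised outside the kernel; the page size, folio size and threshold are illustrative, and memcmp() against a zero page stands in for memchr_inv().

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096
#define HPAGE_NR  512              /* 4 KiB subpages per 2 MiB THP */

static bool page_is_zero(const unsigned char *page)
{
    static const unsigned char zero[PAGE_SIZE];
    return memcmp(page, zero, PAGE_SIZE) == 0;
}

static bool folio_underused(unsigned char (*folio)[PAGE_SIZE], int nr,
                            int max_ptes_none)
{
    int zero = 0, filled = 0;

    for (int i = 0; i < nr; i++) {
        if (page_is_zero(folio[i])) {
            if (++zero > max_ptes_none)
                return true;        /* early exit: underused */
        } else if (++filled >= nr - max_ptes_none) {
            return false;           /* early exit: used enough */
        }
    }
    return false;
}

int main(void)
{
    static unsigned char folio[HPAGE_NR][PAGE_SIZE];    /* all zero */

    folio[0][0] = 1;                                    /* one page actually used */
    printf("underused: %d\n", folio_underused(folio, HPAGE_NR, 64));
    return 0;
}

The kernel version additionally returns false up front when khugepaged_max_ptes_none == HPAGE_PMD_NR - 1, as the hunk above shows.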
static unsigned long deferred_split_scan(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
@@ -3647,8 +3726,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
|
||||
struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
|
||||
unsigned long flags;
|
||||
LIST_HEAD(list);
|
||||
struct folio *folio, *next;
|
||||
int split = 0;
|
||||
struct folio *folio, *next, *prev = NULL;
|
||||
int split = 0, removed = 0;
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
if (sc->memcg)
|
||||
@@ -3663,6 +3742,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
|
||||
list_move(&folio->_deferred_list, &list);
|
||||
} else {
|
||||
/* We lost race with folio_put() */
|
||||
if (folio_test_partially_mapped(folio))
|
||||
__folio_clear_partially_mapped(folio);
|
||||
list_del_init(&folio->_deferred_list);
|
||||
ds_queue->split_queue_len--;
|
||||
}
|
||||
@@ -3672,20 +3753,55 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
|
||||
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
|
||||
|
||||
list_for_each_entry_safe(folio, next, &list, _deferred_list) {
|
||||
bool did_split = false;
|
||||
bool underused = false;
|
||||
|
||||
if (!folio_test_partially_mapped(folio)) {
|
||||
underused = thp_underused(folio);
|
||||
if (!underused)
|
||||
goto next;
|
||||
}
|
||||
if (!folio_trylock(folio))
|
||||
goto next;
|
||||
/* split_huge_page() removes page from list on success */
|
||||
if (!split_folio(folio))
|
||||
if (!split_folio(folio)) {
|
||||
did_split = true;
|
||||
split++;
|
||||
}
|
||||
folio_unlock(folio);
|
||||
next:
|
||||
folio_put(folio);
|
||||
/*
|
||||
* split_folio() removes folio from list on success.
|
||||
* Only add back to the queue if folio is partially mapped.
|
||||
* If thp_underused returns false, or if split_folio fails
|
||||
* in the case it was underused, then consider it used and
|
||||
* don't add it back to split_queue.
|
||||
*/
|
||||
if (did_split) {
|
||||
; /* folio already removed from list */
|
||||
} else if (!folio_test_partially_mapped(folio)) {
|
||||
list_del_init(&folio->_deferred_list);
|
||||
removed++;
|
||||
} else {
|
||||
/*
|
||||
* That unlocked list_del_init() above would be unsafe,
|
||||
* unless its folio is separated from any earlier folios
|
||||
* left on the list (which may be concurrently unqueued)
|
||||
* by one safe folio with refcount still raised.
|
||||
*/
|
||||
swap(folio, prev);
|
||||
}
|
||||
if (folio)
|
||||
folio_put(folio);
|
||||
}
|
||||
|
||||
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
|
||||
list_splice_tail(&list, &ds_queue->split_queue);
|
||||
ds_queue->split_queue_len -= removed;
|
||||
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
|
||||
|
||||
if (prev)
|
||||
folio_put(prev);
|
||||
|
||||
/*
|
||||
* Stop shrinker if we didn't split any page, but the queue is empty.
|
||||
* This can happen if pages were freed under us.
|
||||
|
@@ -470,7 +470,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
|
||||
#define K(x) ((x) << (PAGE_SHIFT-10))
|
||||
|
||||
extern char * const zone_names[MAX_NR_ZONES];
|
||||
extern unsigned long free_highatomics[MAX_NR_ZONES];
|
||||
extern unsigned long nr_free_highatomic[MAX_NR_ZONES];
|
||||
|
||||
/* perform sanity checks on struct pages being allocated or freed */
|
||||
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
|
||||
@@ -721,8 +721,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
|
||||
gfp_t gfp_flags);
|
||||
extern int user_min_free_kbytes;
|
||||
|
||||
extern void free_unref_page(struct page *page, unsigned int order);
|
||||
extern void free_unref_page_list(struct list_head *list);
|
||||
void free_unref_page(struct page *page, unsigned int order);
|
||||
void free_unref_folios(struct folio_batch *fbatch);
|
||||
void free_unref_page_list(struct list_head *list);
|
||||
|
||||
extern void zone_pcp_reset(struct zone *zone);
|
||||
extern void zone_pcp_disable(struct zone *zone);
|
||||
|
@@ -84,7 +84,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
|
||||
*
|
||||
* Note that these are only respected if collapse was initiated by khugepaged.
|
||||
*/
|
||||
static unsigned int khugepaged_max_ptes_none __read_mostly;
|
||||
unsigned int khugepaged_max_ptes_none __read_mostly;
|
||||
static unsigned int khugepaged_max_ptes_swap __read_mostly;
|
||||
static unsigned int khugepaged_max_ptes_shared __read_mostly;
|
||||
|
||||
@@ -1218,6 +1218,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
|
||||
pgtable_trans_huge_deposit(mm, pmd, pgtable);
|
||||
set_pmd_at(mm, address, pmd, _pmd);
|
||||
update_mmu_cache_pmd(vma, address, pmd);
|
||||
deferred_split_folio(folio, false);
|
||||
spin_unlock(pmd_ptl);
|
||||
|
||||
hpage = NULL;
|
||||
|
@@ -33,6 +33,7 @@
|
||||
#include <linux/shmem_fs.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/vm_event_item.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/page-flags.h>
|
||||
@@ -95,6 +96,9 @@ static bool cgroup_memory_nokmem __ro_after_init;
|
||||
/* BPF memory accounting disabled? */
|
||||
static bool cgroup_memory_nobpf __ro_after_init;
|
||||
|
||||
static struct kmem_cache *memcg_cachep;
|
||||
static struct kmem_cache *memcg_pn_cachep;
|
||||
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
|
||||
#endif
|
||||
@@ -5384,7 +5388,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
|
||||
{
|
||||
struct mem_cgroup_per_node *pn;
|
||||
|
||||
pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
|
||||
pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO,
|
||||
node);
|
||||
if (!pn)
|
||||
return 1;
|
||||
|
||||
@@ -5440,7 +5445,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
|
||||
int __maybe_unused i;
|
||||
long error = -ENOMEM;
|
||||
|
||||
memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
|
||||
memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL);
|
||||
if (!memcg)
|
||||
return ERR_PTR(error);
|
||||
|
||||
@@ -6017,8 +6022,6 @@ int mem_cgroup_move_account(struct folio *folio,
|
||||
css_get(&to->css);
|
||||
css_put(&from->css);
|
||||
|
||||
/* Warning should never happen, so don't worry about refcount non-0 */
|
||||
WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
|
||||
folio->memcg_data = (unsigned long)to;
|
||||
|
||||
__folio_memcg_unlock(from);
|
||||
@@ -6389,9 +6392,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
|
||||
enum mc_target_type target_type;
|
||||
union mc_target target;
|
||||
struct folio *folio;
|
||||
bool tried_split_before = false;
|
||||
|
||||
retry_pmd:
|
||||
ptl = pmd_trans_huge_lock(pmd, vma);
|
||||
if (ptl) {
|
||||
if (mc.precharge < HPAGE_PMD_NR) {
|
||||
@@ -6401,27 +6402,6 @@ retry_pmd:
|
||||
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
|
||||
if (target_type == MC_TARGET_PAGE) {
|
||||
folio = target.folio;
|
||||
/*
|
||||
* Deferred split queue locking depends on memcg,
|
||||
* and unqueue is unsafe unless folio refcount is 0:
|
||||
* split or skip if on the queue? first try to split.
|
||||
*/
|
||||
if (!list_empty(&folio->_deferred_list)) {
|
||||
spin_unlock(ptl);
|
||||
if (!tried_split_before)
|
||||
split_folio(folio);
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
if (tried_split_before)
|
||||
return 0;
|
||||
tried_split_before = true;
|
||||
goto retry_pmd;
|
||||
}
|
||||
/*
|
||||
* So long as that pmd lock is held, the folio cannot
|
||||
* be racily added to the _deferred_list, because
|
||||
* __folio_remove_rmap() will find !partially_mapped.
|
||||
*/
|
||||
if (folio_isolate_lru(folio)) {
|
||||
if (!mem_cgroup_move_account(folio, true,
|
||||
mc.from, mc.to)) {
|
||||
@@ -7418,6 +7398,18 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
|
||||
uncharge_batch(&ug);
|
||||
}
|
||||
|
||||
void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
|
||||
{
|
||||
struct uncharge_gather ug;
|
||||
unsigned int i;
|
||||
|
||||
uncharge_gather_clear(&ug);
|
||||
for (i = 0; i < folios->nr; i++)
|
||||
uncharge_folio(folios->folios[i], &ug);
|
||||
if (ug.memcg)
|
||||
uncharge_batch(&ug);
|
||||
}
|
||||
|
||||
/**
|
||||
* mem_cgroup_replace_folio - Charge a folio's replacement.
|
||||
* @old: Currently circulating folio.
|
||||
@@ -7606,15 +7598,16 @@ static int __init cgroup_memory(char *s)
|
||||
__setup("cgroup.memory=", cgroup_memory);
|
||||
|
||||
/*
|
||||
* subsys_initcall() for memory controller.
|
||||
* Memory controller init before cgroup_init() initialize root_mem_cgroup.
|
||||
*
|
||||
* Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
|
||||
* context because of lock dependencies (cgroup_lock -> cpu hotplug) but
|
||||
* basically everything that doesn't depend on a specific mem_cgroup structure
|
||||
* should be initialized from here.
|
||||
*/
|
||||
static int __init mem_cgroup_init(void)
|
||||
int __init mem_cgroup_init(void)
|
||||
{
|
||||
unsigned int memcg_size;
|
||||
int cpu, node;
|
||||
|
||||
/*
|
||||
@@ -7632,6 +7625,13 @@ static int __init mem_cgroup_init(void)
|
||||
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
|
||||
drain_local_stock);
|
||||
|
||||
memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
|
||||
memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
|
||||
SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
|
||||
|
||||
memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node,
|
||||
SLAB_PANIC | SLAB_HWCACHE_ALIGN);
|
||||
|
||||
for_each_node(node) {
|
||||
struct mem_cgroup_tree_per_node *rtpn;
|
||||
|
||||
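The hunks above move struct mem_cgroup and struct mem_cgroup_per_node from kzalloc()/kzalloc_node() onto dedicated slab caches; because mem_cgroup ends in a nodeinfo[] flexible array sized by nr_node_ids, the cache size is computed once at init with struct_size_t(). A generic sketch of the same pattern follows, using a made-up struct foo purely for illustration:

#include <linux/slab.h>
#include <linux/overflow.h>
#include <linux/spinlock.h>
#include <linux/errno.h>

struct foo {
    spinlock_t lock;
    void *node_info[];           /* length only known at boot, e.g. nr_node_ids */
};

static struct kmem_cache *foo_cachep;

static int foo_cache_init(unsigned int nr_nodes)
{
    size_t size = struct_size_t(struct foo, node_info, nr_nodes);

    foo_cachep = kmem_cache_create("foo", size, 0,
                                   SLAB_HWCACHE_ALIGN, NULL);
    return foo_cachep ? 0 : -ENOMEM;
}

static struct foo *foo_alloc(void)
{
    /* zeroed, like kmem_cache_zalloc(memcg_cachep, GFP_KERNEL) above */
    return kmem_cache_zalloc(foo_cachep, GFP_KERNEL);
}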
@@ -7645,7 +7645,6 @@ static int __init mem_cgroup_init(void)
|
||||
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(mem_cgroup_init);
|
||||
|
||||
#ifdef CONFIG_SWAP
|
||||
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
|
||||
|
@@ -19,6 +19,8 @@
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/writeback.h>
|
||||
#include "slab.h"
|
||||
#undef CREATE_TRACE_POINTS
|
||||
#include <trace/hooks/mm.h>
|
||||
|
||||
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
|
||||
static void poison_error(mempool_t *pool, void *element, size_t size,
|
||||
@@ -383,6 +385,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
|
||||
unsigned long flags;
|
||||
wait_queue_entry_t wait;
|
||||
gfp_t gfp_temp;
|
||||
bool skip_wait = false;
|
||||
|
||||
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
|
||||
might_alloc(gfp_mask);
|
||||
@@ -428,6 +431,11 @@ repeat_alloc:
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
return NULL;
|
||||
}
|
||||
trace_android_vh_mempool_alloc_skip_wait(&gfp_temp, &skip_wait);
|
||||
if (skip_wait) {
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
goto repeat_alloc;
|
||||
}
|
||||
|
||||
/* Let's wait for someone else to return an element to @pool */
|
||||
init_wait(&wait);
|
||||
|
mm/migrate.c | 106
@@ -182,13 +182,57 @@ void putback_movable_pages(struct list_head *l)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(putback_movable_pages);
|
||||
|
||||
static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
|
||||
struct folio *folio,
|
||||
unsigned long idx)
|
||||
{
|
||||
struct page *page = folio_page(folio, idx);
|
||||
bool contains_data;
|
||||
pte_t newpte;
|
||||
void *addr;
|
||||
|
||||
VM_BUG_ON_PAGE(PageCompound(page), page);
|
||||
VM_BUG_ON_PAGE(!PageAnon(page), page);
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
|
||||
|
||||
if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
|
||||
mm_forbids_zeropage(pvmw->vma->vm_mm))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* The pmd entry mapping the old thp was flushed and the pte mapping
|
||||
* this subpage has been non present. If the subpage is only zero-filled
|
||||
* then map it to the shared zeropage.
|
||||
*/
|
||||
addr = kmap_local_page(page);
|
||||
contains_data = memchr_inv(addr, 0, PAGE_SIZE);
|
||||
kunmap_local(addr);
|
||||
|
||||
if (contains_data)
|
||||
return false;
|
||||
|
||||
newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
|
||||
pvmw->vma->vm_page_prot));
|
||||
set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
|
||||
|
||||
dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
|
||||
return true;
|
||||
}
|
||||
|
||||
struct rmap_walk_arg {
|
||||
struct folio *folio;
|
||||
bool map_unused_to_zeropage;
|
||||
};
|
||||
|
||||
/*
|
||||
* Restore a potential migration pte to a working pte entry
|
||||
*/
|
||||
static bool remove_migration_pte(struct folio *dst,
|
||||
struct vm_area_struct *vma, unsigned long addr, void *arg)
|
||||
{
|
||||
struct folio *src = arg;
|
||||
struct rmap_walk_arg *rmap_walk_arg = arg;
|
||||
struct folio *src = rmap_walk_arg->folio;
|
||||
DEFINE_FOLIO_VMA_WALK(pvmw, src, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
|
||||
|
||||
while (page_vma_mapped_walk(&pvmw)) {
|
||||
@@ -228,6 +272,9 @@ static bool remove_migration_pte(struct folio *dst,
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
if (rmap_walk_arg->map_unused_to_zeropage &&
|
||||
try_to_map_unused_to_zeropage(&pvmw, folio, idx))
|
||||
continue;
|
||||
|
||||
folio_get(folio);
|
||||
pte = mk_pte(page, READ_ONCE(vma->vm_page_prot));
|
||||
@@ -303,14 +350,21 @@ static bool remove_migration_pte(struct folio *dst,
|
||||
* Get rid of all migration entries and replace them by
|
||||
* references to the indicated page.
|
||||
*/
|
||||
void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
|
||||
void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
|
||||
{
|
||||
struct rmap_walk_control rwc = {
|
||||
.rmap_one = remove_migration_pte,
|
||||
.arg = src,
|
||||
struct rmap_walk_arg rmap_walk_arg = {
|
||||
.folio = src,
|
||||
.map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
|
||||
};
|
||||
|
||||
if (locked)
|
||||
struct rmap_walk_control rwc = {
|
||||
.rmap_one = remove_migration_pte,
|
||||
.arg = &rmap_walk_arg,
|
||||
};
|
||||
|
||||
VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
|
||||
|
||||
if (flags & RMP_LOCKED)
|
||||
rmap_walk_locked(dst, &rwc);
|
||||
else
|
||||
rmap_walk(dst, &rwc);
|
||||
@@ -461,7 +515,8 @@ int folio_migrate_mapping(struct address_space *mapping,
|
||||
}
|
||||
|
||||
/* Take off deferred split queue while frozen and memcg set */
|
||||
folio_unqueue_deferred_split(folio);
|
||||
if (folio_test_large(folio) && folio_test_large_rmappable(folio))
|
||||
folio_unqueue_deferred_split(folio);
|
||||
|
||||
/*
|
||||
* Now we know that no one else is looking at the folio:
|
||||
@@ -933,7 +988,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
|
||||
* At this point we know that the migration attempt cannot
|
||||
* be successful.
|
||||
*/
|
||||
remove_migration_ptes(folio, folio, false);
|
||||
remove_migration_ptes(folio, folio, 0);
|
||||
|
||||
rc = mapping->a_ops->writepage(&folio->page, &wbc);
|
||||
|
||||
@@ -1096,7 +1151,7 @@ static void migrate_folio_undo_src(struct folio *src,
|
||||
struct list_head *ret)
|
||||
{
|
||||
if (page_was_mapped)
|
||||
remove_migration_ptes(src, src, false);
|
||||
remove_migration_ptes(src, src, 0);
|
||||
/* Drop an anon_vma reference if we took one */
|
||||
if (anon_vma)
|
||||
put_anon_vma(anon_vma);
|
||||
@@ -1335,7 +1390,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
|
||||
lru_add_drain();
|
||||
|
||||
if (old_page_state & PAGE_WAS_MAPPED)
|
||||
remove_migration_ptes(src, dst, false);
|
||||
remove_migration_ptes(src, dst, 0);
|
||||
|
||||
out_unlock_both:
|
||||
folio_unlock(dst);
|
||||
@@ -1474,7 +1529,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
|
||||
|
||||
if (page_was_mapped)
|
||||
remove_migration_ptes(src,
|
||||
rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
|
||||
rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
|
||||
|
||||
unlock_put_anon:
|
||||
folio_unlock(dst);
|
||||
@@ -1702,6 +1757,35 @@ static int migrate_pages_batch(struct list_head *from,
|
||||
|
||||
cond_resched();
|
||||
|
||||
/*
|
||||
* The rare folio on the deferred split list should
|
||||
* be split now. It should not count as a failure:
|
||||
* but increment nr_failed because, without doing so,
|
||||
* migrate_pages() may report success with (split but
|
||||
* unmigrated) pages still on its fromlist; whereas it
|
||||
* always reports success when its fromlist is empty.
|
||||
*
|
||||
* Only check it without removing it from the list.
|
||||
* Since the folio can be on deferred_split_scan()
|
||||
* local list and removing it can cause the local list
|
||||
* corruption. Folio split process below can handle it
|
||||
* with the help of folio_ref_freeze().
|
||||
*
|
||||
* nr_pages > 2 is needed to avoid checking order-1
|
||||
* page cache folios. They exist, in contrast to
|
||||
* non-existent order-1 anonymous folios, and do not
|
||||
* use _deferred_list.
|
||||
*/
|
||||
if (nr_pages > 2 &&
|
||||
!list_empty(&folio->_deferred_list) &&
|
||||
folio_test_partially_mapped(folio)) {
|
||||
if (!try_split_folio(folio, split_folios, mode)) {
|
||||
nr_failed++;
|
||||
stats->nr_thp_split += is_thp;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Large folio migration might be unsupported or
|
||||
* the allocation might be failed so we should retry
|
||||
|
@@ -422,7 +422,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
|
||||
continue;
|
||||
|
||||
folio = page_folio(page);
|
||||
remove_migration_ptes(folio, folio, false);
|
||||
remove_migration_ptes(folio, folio, 0);
|
||||
|
||||
src_pfns[i] = 0;
|
||||
folio_unlock(folio);
|
||||
@@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
|
||||
|
||||
src = page_folio(page);
|
||||
dst = page_folio(newpage);
|
||||
remove_migration_ptes(src, dst, false);
|
||||
remove_migration_ptes(src, dst, 0);
|
||||
folio_unlock(src);
|
||||
|
||||
if (is_zone_device_page(page))
|
||||
|
@@ -208,8 +208,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch)
|
||||
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irq(lruvec);
|
||||
folios_put(fbatch->folios, folio_batch_count(fbatch));
|
||||
folio_batch_reinit(fbatch);
|
||||
folios_put(fbatch);
|
||||
}
|
||||
|
||||
void mlock_drain_local(void)
|
||||
|
@@ -1558,7 +1558,7 @@ static inline void setup_usemap(struct zone *zone) {}
|
||||
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
|
||||
void __init set_pageblock_order(void)
|
||||
{
|
||||
unsigned int order = MAX_ORDER;
|
||||
unsigned int order = PAGE_BLOCK_ORDER;
|
||||
|
||||
/* Check that pageblock_nr_pages has not already been setup */
|
||||
if (pageblock_order)
|
||||
|
mm/page_alloc.c | 151
@@ -33,6 +33,7 @@
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/nodemask.h>
|
||||
#include <linux/vmstat.h>
|
||||
@@ -323,7 +324,7 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
|
||||
#endif
|
||||
};
|
||||
|
||||
unsigned long free_highatomics[MAX_NR_ZONES] = {0};
|
||||
unsigned long nr_free_highatomic[MAX_NR_ZONES] = {0};
|
||||
|
||||
int min_free_kbytes = 1024;
|
||||
int user_min_free_kbytes = -1;
|
||||
@@ -770,8 +771,8 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
|
||||
if (is_migrate_cma(migratetype))
|
||||
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
|
||||
else if (is_migrate_highatomic(migratetype))
|
||||
WRITE_ONCE(free_highatomics[zone_idx(zone)],
|
||||
free_highatomics[zone_idx(zone)] + nr_pages);
|
||||
WRITE_ONCE(nr_free_highatomic[zone_idx(zone)],
|
||||
nr_free_highatomic[zone_idx(zone)] + nr_pages);
|
||||
}
|
||||
|
||||
/* Used for pages not on another list */
|
||||
@@ -921,7 +922,6 @@ static inline void __free_one_page(struct page *page,
|
||||
VM_BUG_ON_PAGE(page->flags & check_flags, page);
|
||||
|
||||
VM_BUG_ON(migratetype == -1);
|
||||
|
||||
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
|
||||
VM_BUG_ON_PAGE(bad_range(zone, page), page);
|
||||
|
||||
@@ -1237,6 +1237,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
|
||||
}
|
||||
}
|
||||
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
|
||||
trace_android_vh_mm_free_page(page + i);
|
||||
}
|
||||
}
|
||||
if (PageMappingFlags(page))
|
||||
@@ -1252,6 +1253,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
|
||||
|
||||
page_cpupid_reset_last(page);
|
||||
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
|
||||
trace_android_vh_mm_free_page(page);
|
||||
reset_page_owner(page, order);
|
||||
free_page_pinner(page, order);
|
||||
page_table_check_free(page, order);
|
||||
@@ -1372,7 +1374,6 @@ static void free_one_page(struct zone *zone, struct page *page,
|
||||
static void __free_pages_ok(struct page *page, unsigned int order,
|
||||
fpi_t fpi_flags)
|
||||
{
|
||||
unsigned long flags;
|
||||
int migratetype;
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
struct zone *zone = page_zone(page);
|
||||
@@ -1392,21 +1393,17 @@ skip_prepare:
|
||||
fpi_flags, &skip_free_pages_ok);
|
||||
if (skip_free_pages_ok)
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
/*
|
||||
* Calling get_pfnblock_migratetype() without spin_lock_irqsave() here
|
||||
* is used to avoid calling get_pfnblock_migratetype() under the lock.
|
||||
* This will reduce the lock holding time.
|
||||
*/
|
||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
trace_android_vh_free_unref_page_bypass(page, order, migratetype, &skip_free_unref_page);
|
||||
if (skip_free_unref_page) {
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
if (skip_free_unref_page)
|
||||
return;
|
||||
}
|
||||
|
||||
if (unlikely(has_isolate_pageblock(zone) ||
|
||||
is_migrate_isolate(migratetype))) {
|
||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
}
|
||||
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
free_one_page(zone, page, pfn, order, fpi_flags);
|
||||
|
||||
__count_vm_events(PGFREE, 1 << order);
|
||||
}
|
||||
@@ -2249,8 +2246,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
|
||||
struct zone *zone;
|
||||
struct page *page;
|
||||
int order;
|
||||
int ret;
|
||||
bool skip_unreserve_highatomic = false;
|
||||
int ret;
|
||||
|
||||
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
|
||||
ac->nodemask) {
|
||||
@@ -2765,7 +2762,7 @@ void free_unref_page(struct page *page, unsigned int order)
|
||||
return;
|
||||
if (unlikely(migratetype > MIGRATE_RECLAIMABLE)) {
|
||||
if (unlikely(is_migrate_isolate(migratetype))) {
|
||||
free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
|
||||
free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
|
||||
return;
|
||||
}
|
||||
#ifdef CONFIG_CMA
|
||||
@@ -2781,64 +2778,65 @@ void free_unref_page(struct page *page, unsigned int order)
|
||||
free_unref_page_commit(zone, pcp, page, migratetype, order);
|
||||
pcp_spin_unlock(pcp);
|
||||
} else {
|
||||
free_one_page(zone, page, pfn, order, FPI_NONE);
|
||||
free_one_page(zone, page, pfn, order, FPI_NONE);
|
||||
}
|
||||
pcp_trylock_finish(UP_flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Free a list of 0-order pages
|
||||
* Free a batch of folios
|
||||
*/
|
||||
void free_unref_page_list(struct list_head *list)
|
||||
void free_unref_folios(struct folio_batch *folios)
|
||||
{
|
||||
unsigned long __maybe_unused UP_flags;
|
||||
struct page *page, *next;
|
||||
struct per_cpu_pages *pcp = NULL;
|
||||
struct zone *locked_zone = NULL;
|
||||
int batch_count = 0;
|
||||
int migratetype;
|
||||
bool skip_free = false;
|
||||
int i, j;
|
||||
|
||||
/* Prepare pages for freeing */
|
||||
list_for_each_entry_safe(page, next, list, lru) {
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
if (!free_pages_prepare(page, 0, FPI_NONE)) {
|
||||
list_del(&page->lru);
|
||||
/* Prepare folios for freeing */
|
||||
for (i = 0, j = 0; i < folios->nr; i++) {
|
||||
struct folio *folio = folios->folios[i];
|
||||
unsigned long pfn = folio_pfn(folio);
|
||||
unsigned int order = folio_order(folio);
|
||||
|
||||
if (order > 0 && folio_test_large_rmappable(folio))
|
||||
folio_unqueue_deferred_split(folio);
|
||||
if (!free_pages_prepare(&folio->page, order, FPI_NONE))
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Free isolated pages directly to the allocator, see
|
||||
* comment in free_unref_page.
|
||||
* Free orders not handled on the PCP directly to the
|
||||
* allocator.
|
||||
*/
|
||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
if (unlikely(is_migrate_isolate(migratetype))) {
|
||||
list_del(&page->lru);
|
||||
free_one_page(page_zone(page), page, pfn, 0, FPI_NONE);
|
||||
if (!pcp_allowed_order(order)) {
|
||||
free_one_page(folio_zone(folio), &folio->page,
|
||||
pfn, order, FPI_NONE);
|
||||
continue;
|
||||
}
|
||||
folio->private = (void *)(unsigned long)order;
|
||||
if (j != i)
|
||||
folios->folios[j] = folio;
|
||||
j++;
|
||||
}
|
||||
folios->nr = j;
|
||||
|
||||
trace_android_vh_free_unref_page_list_bypass(list, &skip_free);
|
||||
if (skip_free)
|
||||
return;
|
||||
for (i = 0; i < folios->nr; i++) {
|
||||
struct folio *folio = folios->folios[i];
|
||||
struct zone *zone = folio_zone(folio);
|
||||
unsigned long pfn = folio_pfn(folio);
|
||||
unsigned int order = (unsigned long)folio->private;
|
||||
int migratetype;
|
||||
|
||||
list_for_each_entry_safe(page, next, list, lru) {
|
||||
struct zone *zone = page_zone(page);
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
folio->private = NULL;
|
||||
migratetype = get_pfnblock_migratetype(&folio->page, pfn);
|
||||
|
||||
list_del(&page->lru);
|
||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
|
||||
/*
|
||||
* Either different zone requiring a different pcp lock or
|
||||
* excessive lock hold times when freeing a large list of
|
||||
* pages.
|
||||
*/
|
||||
if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
|
||||
/* Different zone requires a different pcp lock */
|
||||
if (zone != locked_zone ||
|
||||
is_migrate_isolate(migratetype)) {
|
||||
if (pcp) {
|
||||
pcp_spin_unlock(pcp);
|
||||
pcp_trylock_finish(UP_flags);
|
||||
locked_zone = NULL;
|
||||
pcp = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2846,24 +2844,21 @@ void free_unref_page_list(struct list_head *list)
|
||||
* allocator, see comment in free_unref_page.
|
||||
*/
|
||||
if (is_migrate_isolate(migratetype)) {
|
||||
free_one_page(zone, page, page_to_pfn(page),
|
||||
0, FPI_NONE);
|
||||
free_one_page(zone, &folio->page, pfn,
|
||||
order, FPI_NONE);
|
||||
continue;
|
||||
}
|
||||
|
||||
batch_count = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* trylock is necessary as pages may be getting freed
|
||||
* trylock is necessary as folios may be getting freed
|
||||
* from IRQ or SoftIRQ context after an IO completion.
|
||||
*/
|
||||
pcp_trylock_prepare(UP_flags);
|
||||
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
|
||||
if (unlikely(!pcp)) {
|
||||
pcp_trylock_finish(UP_flags);
|
||||
free_one_page(zone, page, pfn,
|
||||
0, FPI_NONE);
|
||||
locked_zone = NULL;
|
||||
free_one_page(zone, &folio->page, pfn,
|
||||
order, FPI_NONE);
|
||||
continue;
|
||||
}
|
||||
locked_zone = zone;
|
||||
@@ -2880,15 +2875,39 @@ void free_unref_page_list(struct list_head *list)
|
||||
migratetype = MIGRATE_MOVABLE;
|
||||
}
|
||||
|
||||
trace_mm_page_free_batched(page);
|
||||
free_unref_page_commit(zone, pcp, page, migratetype, 0);
|
||||
batch_count++;
|
||||
trace_mm_page_free_batched(&folio->page);
|
||||
free_unref_page_commit(zone, pcp, &folio->page, migratetype,
|
||||
order);
|
||||
}
|
||||
|
||||
if (pcp) {
|
||||
pcp_spin_unlock(pcp);
|
||||
pcp_trylock_finish(UP_flags);
|
||||
}
|
||||
folio_batch_reinit(folios);
|
||||
}
|
||||
|
||||
void free_unref_page_list(struct list_head *list)
|
||||
{
|
||||
struct folio_batch fbatch;
|
||||
bool skip_free = false;
|
||||
|
||||
trace_android_vh_free_unref_page_list_bypass(list, &skip_free);
|
||||
if (skip_free)
|
||||
return;
|
||||
|
||||
folio_batch_init(&fbatch);
|
||||
while (!list_empty(list)) {
|
||||
struct folio *folio = list_first_entry(list, struct folio, lru);
|
||||
|
||||
list_del(&folio->lru);
|
||||
if (folio_batch_add(&fbatch, folio) > 0)
|
||||
continue;
|
||||
free_unref_folios(&fbatch);
|
||||
}
|
||||
|
||||
if (fbatch.nr)
|
||||
free_unref_folios(&fbatch);
|
||||
}
|
||||
|
||||
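With this series the bulk-free path is driven by a struct folio_batch instead of an open-coded list: free_unref_page_list() above just drains its list into a batch, and free_unref_folios() records each folio's order in folio->private before committing to the per-CPU lists. A minimal sketch of the idiom from the mm-internal side is below (free_unref_folios() is declared in mm/internal.h in this series, so this only applies inside mm/; the helper name is made up):

#include <linux/mm.h>
#include <linux/pagevec.h>
#include "internal.h"            /* free_unref_folios() */

static void free_folio_array(struct folio **folios, unsigned int nr)
{
    struct folio_batch fbatch;
    unsigned int i;

    folio_batch_init(&fbatch);
    for (i = 0; i < nr; i++) {
        /* folio_batch_add() returns the space left in the batch */
        if (folio_batch_add(&fbatch, folios[i]) > 0)
            continue;
        free_unref_folios(&fbatch);   /* full: flush, batch is reset */
    }
    if (folio_batch_count(&fbatch))
        free_unref_folios(&fbatch);   /* flush the tail */
}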
/*
|
||||
@@ -3216,7 +3235,7 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
|
||||
* watermark then subtract the free pages reserved for highatomic.
|
||||
*/
|
||||
if (likely(!(alloc_flags & ALLOC_RESERVES)))
|
||||
unusable_free += READ_ONCE(free_highatomics[zone_idx(z)]);
|
||||
unusable_free += READ_ONCE(nr_free_highatomic[zone_idx(z)]);
|
||||
|
||||
#ifdef CONFIG_CMA
|
||||
/* If allocation can't use CMA areas don't use free CMA pages */
|
||||
|
@@ -417,9 +417,9 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
|
||||
|
||||
ret = __alloc_contig_migrate_range(&cc, head_pfn,
|
||||
head_pfn + nr_pages, page_mt);
|
||||
|
||||
if (ret)
|
||||
goto failed;
|
||||
|
||||
pfn = head_pfn + nr_pages;
|
||||
continue;
|
||||
}
|
||||
|
@@ -270,6 +270,9 @@ static const struct vm_operations_struct pad_vma_ops = {
|
||||
.name = pad_vma_name,
|
||||
};
|
||||
|
||||
/* Defined in kernel/fork.c */
|
||||
extern struct kmem_cache *vm_area_cachep;
|
||||
|
||||
/*
|
||||
* Returns a new VMA representing the padding in @vma;
|
||||
* returns NULL if no padding in @vma or allocation failed.
|
||||
@@ -281,7 +284,7 @@ static struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma)
|
||||
if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
|
||||
return NULL;
|
||||
|
||||
pad = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
|
||||
pad = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
|
||||
if (!pad) {
|
||||
pr_warn("Page size migration: Failed to allocate padding VMA");
|
||||
return NULL;
|
||||
@@ -347,7 +350,7 @@ void show_map_pad_vma(struct vm_area_struct *vma, struct seq_file *m,
|
||||
else
|
||||
((show_pad_maps_fn)func)(m, pad);
|
||||
|
||||
kfree(pad);
|
||||
kmem_cache_free(vm_area_cachep, pad);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -1599,8 +1599,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
|
||||
* Check partially_mapped first to ensure it is a large folio.
|
||||
*/
|
||||
if (folio_test_anon(folio) && partially_mapped &&
|
||||
list_empty(&folio->_deferred_list))
|
||||
deferred_split_folio(folio);
|
||||
!folio_test_partially_mapped(folio))
|
||||
deferred_split_folio(folio, true);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -342,7 +342,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
|
||||
K(low_wmark_pages(zone)),
|
||||
K(high_wmark_pages(zone)),
|
||||
K(zone->nr_reserved_highatomic),
|
||||
K(free_highatomics[zone_idx(zone)]),
|
||||
K(nr_free_highatomic[zone_idx(zone)]),
|
||||
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
|
||||
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
|
||||
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
|
||||
|
mm/swap.c | 171
@@ -77,26 +77,33 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
|
||||
.lock = INIT_LOCAL_LOCK(lock),
|
||||
};
|
||||
|
||||
static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
|
||||
unsigned long *flagsp)
|
||||
{
|
||||
if (folio_test_lru(folio)) {
|
||||
folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
|
||||
lruvec_del_folio(*lruvecp, folio);
|
||||
__folio_clear_lru_flags(folio);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This path almost never happens for VM activity - pages are normally freed
|
||||
* in batches. But it gets used by networking - and for compound pages.
|
||||
*/
|
||||
static void __page_cache_release(struct folio *folio)
|
||||
static void page_cache_release(struct folio *folio)
|
||||
{
|
||||
if (folio_test_lru(folio)) {
|
||||
struct lruvec *lruvec;
|
||||
unsigned long flags;
|
||||
struct lruvec *lruvec = NULL;
|
||||
unsigned long flags;
|
||||
|
||||
lruvec = folio_lruvec_lock_irqsave(folio, &flags);
|
||||
lruvec_del_folio(lruvec, folio);
|
||||
__folio_clear_lru_flags(folio);
|
||||
__page_cache_release(folio, &lruvec, &flags);
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
}
|
||||
}
|
||||
|
||||
static void __folio_put_small(struct folio *folio)
|
||||
{
|
||||
__page_cache_release(folio);
|
||||
page_cache_release(folio);
|
||||
mem_cgroup_uncharge(folio);
|
||||
free_unref_page(&folio->page, 0);
|
||||
}
|
||||
@@ -110,7 +117,7 @@ static void __folio_put_large(struct folio *folio)
|
||||
* be called for hugetlb (it has a separate hugetlb_cgroup.)
|
||||
*/
|
||||
if (!folio_test_hugetlb(folio))
|
||||
__page_cache_release(folio);
|
||||
page_cache_release(folio);
|
||||
destroy_large_folio(folio);
|
||||
}
|
||||
|
||||
@@ -133,22 +140,25 @@ EXPORT_SYMBOL(__folio_put);
|
||||
*/
|
||||
void put_pages_list(struct list_head *pages)
|
||||
{
|
||||
struct folio_batch fbatch;
|
||||
struct folio *folio, *next;
|
||||
|
||||
folio_batch_init(&fbatch);
|
||||
list_for_each_entry_safe(folio, next, pages, lru) {
|
||||
if (!folio_put_testzero(folio)) {
|
||||
list_del(&folio->lru);
|
||||
if (!folio_put_testzero(folio))
|
||||
continue;
|
||||
}
|
||||
if (folio_test_large(folio)) {
|
||||
list_del(&folio->lru);
|
||||
__folio_put_large(folio);
|
||||
continue;
|
||||
}
|
||||
/* LRU flag must be clear because it's passed using the lru */
|
||||
if (folio_batch_add(&fbatch, folio) > 0)
|
||||
continue;
|
||||
free_unref_folios(&fbatch);
|
||||
}
|
||||
|
||||
free_unref_page_list(pages);
|
||||
if (fbatch.nr)
|
||||
free_unref_folios(&fbatch);
|
||||
INIT_LIST_HEAD(pages);
|
||||
}
|
||||
EXPORT_SYMBOL(put_pages_list);
|
||||
@@ -170,7 +180,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
|
||||
* while the LRU lock is held.
|
||||
*
|
||||
* (That is not true of __page_cache_release(), and not necessarily
|
||||
* true of release_pages(): but those only clear the mlocked flag after
|
||||
* true of folios_put(): but those only clear the mlocked flag after
|
||||
* folio_put_testzero() has excluded any other users of the folio.)
|
||||
*/
|
||||
if (folio_evictable(folio)) {
|
||||
@@ -208,7 +218,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
|
||||
if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
|
||||
continue;
|
||||
|
||||
lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
|
||||
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
|
||||
move_fn(lruvec, folio);
|
||||
|
||||
folio_set_lru(folio);
|
||||
@@ -216,8 +226,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
|
||||
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
folios_put(fbatch->folios, folio_batch_count(fbatch));
|
||||
folio_batch_reinit(fbatch);
|
||||
folios_put(fbatch);
|
||||
}
|
||||
|
||||
static void folio_batch_add_and_move(struct folio_batch *fbatch,
|
||||
@@ -958,47 +967,29 @@ void lru_cache_disable(void)
|
||||
EXPORT_SYMBOL_GPL(lru_cache_disable);
|
||||
|
||||
/**
|
||||
* release_pages - batched put_page()
|
||||
* @arg: array of pages to release
|
||||
* @nr: number of pages
|
||||
* folios_put_refs - Reduce the reference count on a batch of folios.
|
||||
* @folios: The folios.
|
||||
* @refs: The number of refs to subtract from each folio.
|
||||
*
|
||||
* Decrement the reference count on all the pages in @arg. If it
|
||||
* fell to zero, remove the page from the LRU and free it.
|
||||
* Like folio_put(), but for a batch of folios. This is more efficient
|
||||
* than writing the loop yourself as it will optimise the locks which need
|
||||
* to be taken if the folios are freed. The folios batch is returned
|
||||
* empty and ready to be reused for another batch; there is no need
|
||||
* to reinitialise it. If @refs is NULL, we subtract one from each
|
||||
* folio refcount.
|
||||
*
|
||||
* Note that the argument can be an array of pages, encoded pages,
|
||||
* or folio pointers. We ignore any encoded bits, and turn any of
|
||||
* them into just a folio that gets free'd.
|
||||
* Context: May be called in process or interrupt context, but not in NMI
|
||||
* context. May be called while holding a spinlock.
|
||||
*/
|
||||
void release_pages(release_pages_arg arg, int nr)
|
||||
void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
|
||||
{
|
||||
int i;
|
||||
struct encoded_page **encoded = arg.encoded_pages;
|
||||
LIST_HEAD(pages_to_free);
|
||||
int i, j;
|
||||
struct lruvec *lruvec = NULL;
|
||||
unsigned long flags = 0;
|
||||
unsigned int lock_batch;
|
||||
|
||||
for (i = 0; i < nr; i++) {
|
||||
unsigned int nr_refs = 1;
|
||||
struct folio *folio;
|
||||
|
||||
/* Turn any of the argument types into a folio */
|
||||
folio = page_folio(encoded_page_ptr(encoded[i]));
|
||||
|
||||
/* Is our next entry actually "nr_pages" -> "nr_refs" ? */
|
||||
if (unlikely(encoded_page_flags(encoded[i]) &
|
||||
ENCODED_PAGE_BIT_NR_PAGES_NEXT))
|
||||
nr_refs = encoded_nr_pages(encoded[++i]);
|
||||
|
||||
/*
|
||||
* Make sure the IRQ-safe lock-holding time does not get
|
||||
* excessive with a continuous string of pages from the
|
||||
* same lruvec. The lock is held only if lruvec != NULL.
|
||||
*/
|
||||
if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
lruvec = NULL;
|
||||
}
|
||||
for (i = 0, j = 0; i < folios->nr; i++) {
|
||||
struct folio *folio = folios->folios[i];
|
||||
unsigned int nr_refs = refs ? refs[i] : 1;
|
||||
|
||||
if (is_huge_zero_page(&folio->page))
|
||||
continue;
|
||||
@@ -1018,34 +1009,73 @@ void release_pages(release_pages_arg arg, int nr)
|
||||
if (!folio_ref_sub_and_test(folio, nr_refs))
|
||||
continue;
|
||||
|
||||
if (folio_test_large(folio)) {
|
||||
/* hugetlb has its own memcg */
|
||||
if (folio_test_hugetlb(folio)) {
|
||||
if (lruvec) {
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
lruvec = NULL;
|
||||
}
|
||||
__folio_put_large(folio);
|
||||
free_huge_folio(folio);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (folio_test_lru(folio)) {
|
||||
struct lruvec *prev_lruvec = lruvec;
|
||||
folio_unqueue_deferred_split(folio);
|
||||
__page_cache_release(folio, &lruvec, &flags);
|
||||
|
||||
lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
|
||||
&flags);
|
||||
if (prev_lruvec != lruvec)
|
||||
lock_batch = 0;
|
||||
|
||||
lruvec_del_folio(lruvec, folio);
|
||||
__folio_clear_lru_flags(folio);
|
||||
}
|
||||
|
||||
list_add(&folio->lru, &pages_to_free);
|
||||
if (j != i)
|
||||
folios->folios[j] = folio;
|
||||
j++;
|
||||
}
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
if (!j) {
|
||||
folio_batch_reinit(folios);
|
||||
return;
|
||||
}
|
||||
|
||||
mem_cgroup_uncharge_list(&pages_to_free);
|
||||
free_unref_page_list(&pages_to_free);
|
||||
folios->nr = j;
|
||||
mem_cgroup_uncharge_folios(folios);
|
||||
free_unref_folios(folios);
|
||||
}
|
||||
EXPORT_SYMBOL(folios_put_refs);
|
||||
|
||||
/**
|
||||
* release_pages - batched put_page()
|
||||
* @arg: array of pages to release
|
||||
* @nr: number of pages
|
||||
*
|
||||
* Decrement the reference count on all the pages in @arg. If it
|
||||
* fell to zero, remove the page from the LRU and free it.
|
||||
*
|
||||
* Note that the argument can be an array of pages, encoded pages,
|
||||
* or folio pointers. We ignore any encoded bits, and turn any of
|
||||
* them into just a folio that gets free'd.
|
||||
*/
|
||||
void release_pages(release_pages_arg arg, int nr)
|
||||
{
|
||||
struct folio_batch fbatch;
|
||||
int refs[PAGEVEC_SIZE];
|
||||
struct encoded_page **encoded = arg.encoded_pages;
|
||||
int i;
|
||||
|
||||
folio_batch_init(&fbatch);
|
||||
for (i = 0; i < nr; i++) {
|
||||
/* Turn any of the argument types into a folio */
|
||||
struct folio *folio = page_folio(encoded_page_ptr(encoded[i]));
|
||||
|
||||
/* Is our next entry actually "nr_pages" -> "nr_refs" ? */
|
||||
refs[fbatch.nr] = 1;
|
||||
if (unlikely(encoded_page_flags(encoded[i]) &
|
||||
ENCODED_PAGE_BIT_NR_PAGES_NEXT))
|
||||
refs[fbatch.nr] = encoded_nr_pages(encoded[++i]);
|
||||
|
||||
if (folio_batch_add(&fbatch, folio) > 0)
|
||||
continue;
|
||||
folios_put_refs(&fbatch, refs);
|
||||
}
|
||||
|
||||
if (fbatch.nr)
|
||||
folios_put_refs(&fbatch, refs);
|
||||
}
|
||||
EXPORT_SYMBOL(release_pages);
|
||||
|
||||
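release_pages() above now only translates its encoded-page array into a folio batch plus a parallel refs[] array and hands both to folios_put_refs(); the callers converted elsewhere in this series (__folio_batch_release(), mlock_folio_batch(), folio_batch_move_lru()) go through folios_put(), which is presumably reduced to a thin wrapper along the lines of the sketch below (the exact definition lives in mm.h and is not shown in these hunks):

/* assumed wrapper: drop exactly one reference per folio in the batch */
static inline void folios_put(struct folio_batch *folios)
{
    folios_put_refs(folios, NULL);
}

The separate refs[PAGEVEC_SIZE] array exists so that a single ENCODED_PAGE_BIT_NR_PAGES_NEXT entry can drop several references on one folio in the same pass, as the loop above shows.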
@@ -1065,8 +1095,7 @@ void __folio_batch_release(struct folio_batch *fbatch)
|
||||
lru_add_drain();
|
||||
fbatch->percpu_pvec_drained = true;
|
||||
}
|
||||
release_pages(fbatch->folios, folio_batch_count(fbatch));
|
||||
folio_batch_reinit(fbatch);
|
||||
folios_put(fbatch);
|
||||
}
|
||||
EXPORT_SYMBOL(__folio_batch_release);
|
||||
|
||||
|
@@ -1358,6 +1358,7 @@ const char * const vmstat_text[] = {
|
||||
"thp_split_page",
|
||||
"thp_split_page_failed",
|
||||
"thp_deferred_split_page",
|
||||
"thp_underused_split_page",
|
||||
"thp_split_pmd",
|
||||
"thp_shatter_page",
|
||||
"thp_shatter_page_failed",
|
||||
|
@@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/
|
||||
obj-$(CONFIG_INET) += ipv4/
|
||||
obj-$(CONFIG_TLS) += tls/
|
||||
obj-$(CONFIG_XFRM) += xfrm/
|
||||
obj-$(CONFIG_UNIX_SCM) += unix/
|
||||
obj-$(CONFIG_UNIX) += unix/
|
||||
obj-y += ipv6/
|
||||
obj-$(CONFIG_BPFILTER) += bpfilter/
|
||||
obj-$(CONFIG_PACKET) += packet/
|
||||
|
@@ -36,6 +36,7 @@
#include <net/compat.h>
#include <net/scm.h>
#include <net/cls_cgroup.h>
#include <net/af_unix.h>


/*
@@ -85,8 +86,15 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
return -ENOMEM;
*fplp = fpl;
fpl->count = 0;
fpl->count_unix = 0;
fpl->max = SCM_MAX_FD;
fpl->user = NULL;
#if IS_ENABLED(CONFIG_UNIX)
fpl->inflight = false;
fpl->dead = false;
fpl->edges = NULL;
INIT_LIST_HEAD(&fpl->vertices);
#endif
}
fpp = &fpl->fp[fpl->count];

@@ -109,6 +117,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
fput(file);
return -EINVAL;
}
if (unix_get_socket(file))
fpl->count_unix++;

*fpp++ = file;
fpl->count++;
}
@@ -366,13 +377,18 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
if (!fpl)
return NULL;

new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
new_fpl = kmemdup(fpl, sizeof(*fpl),
GFP_KERNEL_ACCOUNT);
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
new_fpl->max = new_fpl->count;
new_fpl->user = get_uid(fpl->user);
#if IS_ENABLED(CONFIG_UNIX)
new_fpl->inflight = false;
new_fpl->edges = NULL;
INIT_LIST_HEAD(&new_fpl->vertices);
#endif
}
return new_fpl;
}
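In the scm_fp_dup() hunk above, the duplicate is now taken with kmemdup(fpl, sizeof(*fpl), ...) instead of copying only the used prefix up to fp[fpl->count], so the copy always carries the full structure layout. The sketch below contrasts the two copy strategies on a hypothetical fd_list type; it is a simplified userspace model, not the kernel structure or its motivation.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_FD 253 /* stand-in for SCM_MAX_FD */

struct fd_list {
    int count;
    int max;
    int fd[MAX_FD];
};

/* Duplicate only the used prefix: cheaper, but the copy has no head-room left. */
static struct fd_list *dup_prefix(const struct fd_list *src)
{
    size_t used = offsetof(struct fd_list, fd[src->count]);
    struct fd_list *copy = malloc(used);

    if (!copy)
        return NULL;
    memcpy(copy, src, used);
    copy->max = copy->count;
    return copy;
}

/* Duplicate the whole structure: the copy keeps room for up to MAX_FD entries. */
static struct fd_list *dup_full(const struct fd_list *src)
{
    struct fd_list *copy = malloc(sizeof(*copy));

    if (!copy)
        return NULL;
    memcpy(copy, src, sizeof(*copy));
    return copy;
}

int main(void)
{
    struct fd_list orig = { .count = 2, .max = MAX_FD, .fd = { 3, 4 } };
    struct fd_list *a = dup_prefix(&orig);
    struct fd_list *b = dup_full(&orig);

    printf("prefix copy: count=%d max=%d\n", a->count, a->max);
    printf("full copy:   count=%d max=%d\n", b->count, b->max);
    free(a);
    free(b);
    return 0;
}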
@@ -16,11 +16,6 @@ config UNIX

Say Y unless you know what you are doing.

config UNIX_SCM
bool
depends on UNIX
default y

config AF_UNIX_OOB
bool
depends on UNIX

@@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o

obj-$(CONFIG_UNIX_DIAG) += unix_diag.o
unix_diag-y := diag.o

obj-$(CONFIG_UNIX_SCM) += scm.o
@@ -117,8 +117,6 @@
#include <linux/file.h>
#include <linux/btf_ids.h>

#include "scm.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
@@ -980,11 +978,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
sk->sk_destruct = unix_sock_destructor;
u = unix_sk(sk);
u->inflight = 0;
u->listener = NULL;
u->vertex = NULL;
u->path.dentry = NULL;
u->path.mnt = NULL;
spin_lock_init(&u->lock);
INIT_LIST_HEAD(&u->link);
mutex_init(&u->iolock); /* single task reading lock */
mutex_init(&u->bindlock); /* single task binding lock */
init_waitqueue_head(&u->peer_wait);
@@ -1583,6 +1581,7 @@ restart:
newsk->sk_type = sk->sk_type;
init_peercred(newsk);
newu = unix_sk(newsk);
newu->listener = other;
RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
otheru = unix_sk(other);

@@ -1678,8 +1677,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
bool kern)
{
struct sock *sk = sock->sk;
struct sock *tsk;
struct sk_buff *skb;
struct sock *tsk;
int err;

err = -EOPNOTSUPP;
@@ -1709,6 +1708,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,

/* attach accepted sock to socket */
unix_state_lock(tsk);
unix_update_edges(unix_sk(tsk));
newsock->state = SS_CONNECTED;
unix_sock_inherit_flags(sock, newsock);
sock_graft(tsk, newsock);
@@ -1752,51 +1752,65 @@ out:
return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
* collection lock, and we just read it locklessly here. If you go
* over the limit, there might be a tiny race in actually noticing
* it across threads. Tough.
*/
static inline bool too_many_unix_fds(struct task_struct *p)
{
struct user_struct *user = current_user();

if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
if (too_many_unix_fds(current))
return -ETOOMANYREFS;

/* Need to duplicate file references for the sake of garbage
* collection. Otherwise a socket in the fps might become a
* candidate for GC while the skb is not yet queued.
*/
UNIXCB(skb).fp = scm_fp_dup(scm->fp);
if (!UNIXCB(skb).fp)
return -ENOMEM;

if (unix_prepare_fpl(UNIXCB(skb).fp))
return -ENOMEM;

return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
scm->fp = UNIXCB(skb).fp;
UNIXCB(skb).fp = NULL;

unix_destroy_fpl(scm->fp);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
scm->fp = scm_fp_dup(UNIXCB(skb).fp);
}

/*
* Garbage collection of unix sockets starts by selecting a set of
* candidate sockets which have reference only from being in flight
* (total_refs == inflight_refs). This condition is checked once during
* the candidate collection phase, and candidates are marked as such, so
* that non-candidates can later be ignored. While inflight_refs is
* protected by unix_gc_lock, total_refs (file count) is not, hence this
* is an instantaneous decision.
*
* Once a candidate, however, the socket must not be reinstalled into a
* file descriptor while the garbage collection is in progress.
*
* If the above conditions are met, then the directed graph of
* candidates (*) does not change while unix_gc_lock is held.
*
* Any operations that changes the file count through file descriptors
* (dup, close, sendmsg) does not change the graph since candidates are
* not installed in fds.
*
* Dequeing a candidate via recvmsg would install it into an fd, but
* that takes unix_gc_lock to decrement the inflight count, so it's
* serialized with garbage collection.
*
* MSG_PEEK is special in that it does not change the inflight count,
* yet does install the socket into an fd. The following lock/unlock
* pair is to ensure serialization with garbage collection. It must be
* done between incrementing the file count and installing the file into
* an fd.
*
* If garbage collection starts after the barrier provided by the
* lock/unlock, then it will see the elevated refcount and not mark this
* as a candidate. If a garbage collection is already in progress
* before the file count was incremented, then the lock/unlock pair will
* ensure that garbage collection is finished before progressing to
* installing the fd.
*
* (*) A -> B where B is on the queue of A or B is on the queue of C
* which is on the queue of listening socket A.
*/
spin_lock(&unix_gc_lock);
spin_unlock(&unix_gc_lock);
static void unix_destruct_scm(struct sk_buff *skb)
{
struct scm_cookie scm;

memset(&scm, 0, sizeof(scm));
scm.pid = UNIXCB(skb).pid;
if (UNIXCB(skb).fp)
unix_detach_fds(&scm, skb);

/* Alas, it calls VFS */
/* So fscking what? fput() had been SMP-safe since the last Summer */
scm_destroy(&scm);
sock_wfree(skb);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
@@ -1855,8 +1869,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);

if (unlikely(fp && fp->count))
if (unlikely(fp && fp->count)) {
atomic_add(fp->count, &u->scm_stat.nr_fds);
unix_add_edges(fp, u);
}
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
@@ -1864,8 +1880,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);

if (unlikely(fp && fp->count))
if (unlikely(fp && fp->count)) {
atomic_sub(fp->count, &u->scm_stat.nr_fds);
unix_del_edges(fp);
}
}

/*
@@ -1885,11 +1903,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
long timeo;
int err;

wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;

wait_for_unix_gc(scm.fp);

err = -EOPNOTSUPP;
if (msg->msg_flags&MSG_OOB)
goto out;
@@ -2157,11 +2176,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
bool fds_sent = false;
int data_len;

wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;

wait_for_unix_gc(scm.fp);

err = -EOPNOTSUPP;
if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
@@ -81,249 +81,519 @@
|
||||
#include <net/scm.h>
|
||||
#include <net/tcp_states.h>
|
||||
|
||||
#include "scm.h"
|
||||
|
||||
/* Internal data structures and random procedures: */
|
||||
|
||||
static LIST_HEAD(gc_candidates);
|
||||
|
||||
static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
|
||||
struct sk_buff_head *hitlist)
|
||||
struct unix_sock *unix_get_socket(struct file *filp)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
struct sk_buff *next;
|
||||
struct inode *inode = file_inode(filp);
|
||||
|
||||
spin_lock(&x->sk_receive_queue.lock);
|
||||
skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
|
||||
/* Do we have file descriptors ? */
|
||||
if (UNIXCB(skb).fp) {
|
||||
bool hit = false;
|
||||
/* Process the descriptors of this socket */
|
||||
int nfd = UNIXCB(skb).fp->count;
|
||||
struct file **fp = UNIXCB(skb).fp->fp;
|
||||
/* Socket ? */
|
||||
if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
|
||||
struct socket *sock = SOCKET_I(inode);
|
||||
const struct proto_ops *ops;
|
||||
struct sock *sk = sock->sk;
|
||||
|
||||
while (nfd--) {
|
||||
/* Get the socket the fd matches if it indeed does so */
|
||||
struct unix_sock *u = unix_get_socket(*fp++);
|
||||
ops = READ_ONCE(sock->ops);
|
||||
|
||||
/* Ignore non-candidates, they could have been added
|
||||
* to the queues after starting the garbage collection
|
||||
*/
|
||||
if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
|
||||
hit = true;
|
||||
|
||||
func(u);
|
||||
}
|
||||
}
|
||||
if (hit && hitlist != NULL) {
|
||||
__skb_unlink(skb, &x->sk_receive_queue);
|
||||
__skb_queue_tail(hitlist, skb);
|
||||
}
|
||||
}
|
||||
/* PF_UNIX ? */
|
||||
if (sk && ops && ops->family == PF_UNIX)
|
||||
return unix_sk(sk);
|
||||
}
|
||||
spin_unlock(&x->sk_receive_queue.lock);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
|
||||
struct sk_buff_head *hitlist)
|
||||
static struct unix_vertex *unix_edge_successor(struct unix_edge *edge)
|
||||
{
|
||||
if (x->sk_state != TCP_LISTEN) {
|
||||
scan_inflight(x, func, hitlist);
|
||||
} else {
|
||||
struct sk_buff *skb;
|
||||
struct sk_buff *next;
|
||||
struct unix_sock *u;
|
||||
LIST_HEAD(embryos);
|
||||
|
||||
/* For a listening socket collect the queued embryos
|
||||
* and perform a scan on them as well.
|
||||
*/
|
||||
spin_lock(&x->sk_receive_queue.lock);
|
||||
skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
|
||||
u = unix_sk(skb->sk);
|
||||
|
||||
/* An embryo cannot be in-flight, so it's safe
|
||||
* to use the list link.
|
||||
*/
|
||||
BUG_ON(!list_empty(&u->link));
|
||||
list_add_tail(&u->link, &embryos);
|
||||
}
|
||||
spin_unlock(&x->sk_receive_queue.lock);
|
||||
|
||||
while (!list_empty(&embryos)) {
|
||||
u = list_entry(embryos.next, struct unix_sock, link);
|
||||
scan_inflight(&u->sk, func, hitlist);
|
||||
list_del_init(&u->link);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void dec_inflight(struct unix_sock *usk)
|
||||
{
|
||||
usk->inflight--;
|
||||
}
|
||||
|
||||
static void inc_inflight(struct unix_sock *usk)
|
||||
{
|
||||
usk->inflight++;
|
||||
}
|
||||
|
||||
static void inc_inflight_move_tail(struct unix_sock *u)
|
||||
{
|
||||
u->inflight++;
|
||||
|
||||
/* If this still might be part of a cycle, move it to the end
|
||||
* of the list, so that it's checked even if it was already
|
||||
* passed over
|
||||
/* If an embryo socket has a fd,
|
||||
* the listener indirectly holds the fd's refcnt.
|
||||
*/
|
||||
if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
|
||||
list_move_tail(&u->link, &gc_candidates);
|
||||
if (edge->successor->listener)
|
||||
return unix_sk(edge->successor->listener)->vertex;
|
||||
|
||||
return edge->successor->vertex;
|
||||
}
|
||||
|
||||
static bool unix_graph_maybe_cyclic;
|
||||
static bool unix_graph_grouped;
|
||||
|
||||
static void unix_update_graph(struct unix_vertex *vertex)
|
||||
{
|
||||
/* If the receiver socket is not inflight, no cyclic
|
||||
* reference could be formed.
|
||||
*/
|
||||
if (!vertex)
|
||||
return;
|
||||
|
||||
unix_graph_maybe_cyclic = true;
|
||||
unix_graph_grouped = false;
|
||||
}
|
||||
|
||||
static LIST_HEAD(unix_unvisited_vertices);
|
||||
|
||||
enum unix_vertex_index {
|
||||
UNIX_VERTEX_INDEX_MARK1,
|
||||
UNIX_VERTEX_INDEX_MARK2,
|
||||
UNIX_VERTEX_INDEX_START,
|
||||
};
|
||||
|
||||
static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;
|
||||
|
||||
static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
|
||||
{
|
||||
struct unix_vertex *vertex = edge->predecessor->vertex;
|
||||
|
||||
if (!vertex) {
|
||||
vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
|
||||
vertex->index = unix_vertex_unvisited_index;
|
||||
vertex->out_degree = 0;
|
||||
INIT_LIST_HEAD(&vertex->edges);
|
||||
INIT_LIST_HEAD(&vertex->scc_entry);
|
||||
|
||||
list_move_tail(&vertex->entry, &unix_unvisited_vertices);
|
||||
edge->predecessor->vertex = vertex;
|
||||
}
|
||||
|
||||
vertex->out_degree++;
|
||||
list_add_tail(&edge->vertex_entry, &vertex->edges);
|
||||
|
||||
unix_update_graph(unix_edge_successor(edge));
|
||||
}
|
||||
|
||||
static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
|
||||
{
|
||||
struct unix_vertex *vertex = edge->predecessor->vertex;
|
||||
|
||||
if (!fpl->dead)
|
||||
unix_update_graph(unix_edge_successor(edge));
|
||||
|
||||
list_del(&edge->vertex_entry);
|
||||
vertex->out_degree--;
|
||||
|
||||
if (!vertex->out_degree) {
|
||||
edge->predecessor->vertex = NULL;
|
||||
list_move_tail(&vertex->entry, &fpl->vertices);
|
||||
}
|
||||
}
|
||||
|
||||
static void unix_free_vertices(struct scm_fp_list *fpl)
|
||||
{
|
||||
struct unix_vertex *vertex, *next_vertex;
|
||||
|
||||
list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) {
|
||||
list_del(&vertex->entry);
|
||||
kfree(vertex);
|
||||
}
|
||||
}
|
||||
|
||||
static DEFINE_SPINLOCK(unix_gc_lock);
|
||||
unsigned int unix_tot_inflight;
|
||||
|
||||
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
|
||||
{
|
||||
int i = 0, j = 0;
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
if (!fpl->count_unix)
|
||||
goto out;
|
||||
|
||||
do {
|
||||
struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
|
||||
struct unix_edge *edge;
|
||||
|
||||
if (!inflight)
|
||||
continue;
|
||||
|
||||
edge = fpl->edges + i++;
|
||||
edge->predecessor = inflight;
|
||||
edge->successor = receiver;
|
||||
|
||||
unix_add_edge(fpl, edge);
|
||||
} while (i < fpl->count_unix);
|
||||
|
||||
receiver->scm_stat.nr_unix_fds += fpl->count_unix;
|
||||
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
|
||||
out:
|
||||
WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);
|
||||
|
||||
spin_unlock(&unix_gc_lock);
|
||||
|
||||
fpl->inflight = true;
|
||||
|
||||
unix_free_vertices(fpl);
|
||||
}
|
||||
|
||||
void unix_del_edges(struct scm_fp_list *fpl)
|
||||
{
|
||||
struct unix_sock *receiver;
|
||||
int i = 0;
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
if (!fpl->count_unix)
|
||||
goto out;
|
||||
|
||||
do {
|
||||
struct unix_edge *edge = fpl->edges + i++;
|
||||
|
||||
unix_del_edge(fpl, edge);
|
||||
} while (i < fpl->count_unix);
|
||||
|
||||
if (!fpl->dead) {
|
||||
receiver = fpl->edges[0].successor;
|
||||
receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
|
||||
}
|
||||
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
|
||||
out:
|
||||
WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
|
||||
|
||||
spin_unlock(&unix_gc_lock);
|
||||
|
||||
fpl->inflight = false;
|
||||
}
|
||||
|
||||
void unix_update_edges(struct unix_sock *receiver)
|
||||
{
|
||||
/* nr_unix_fds is only updated under unix_state_lock().
|
||||
* If it's 0 here, the embryo socket is not part of the
|
||||
* inflight graph, and GC will not see it, so no lock needed.
|
||||
*/
|
||||
if (!receiver->scm_stat.nr_unix_fds) {
|
||||
receiver->listener = NULL;
|
||||
} else {
|
||||
spin_lock(&unix_gc_lock);
|
||||
unix_update_graph(unix_sk(receiver->listener)->vertex);
|
||||
receiver->listener = NULL;
|
||||
spin_unlock(&unix_gc_lock);
|
||||
}
|
||||
}
|
||||
|
||||
int unix_prepare_fpl(struct scm_fp_list *fpl)
|
||||
{
|
||||
struct unix_vertex *vertex;
|
||||
int i;
|
||||
|
||||
if (!fpl->count_unix)
|
||||
return 0;
|
||||
|
||||
for (i = 0; i < fpl->count_unix; i++) {
|
||||
vertex = kmalloc(sizeof(*vertex), GFP_KERNEL);
|
||||
if (!vertex)
|
||||
goto err;
|
||||
|
||||
list_add(&vertex->entry, &fpl->vertices);
|
||||
}
|
||||
|
||||
fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges),
|
||||
GFP_KERNEL_ACCOUNT);
|
||||
if (!fpl->edges)
|
||||
goto err;
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
unix_free_vertices(fpl);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
void unix_destroy_fpl(struct scm_fp_list *fpl)
|
||||
{
|
||||
if (fpl->inflight)
|
||||
unix_del_edges(fpl);
|
||||
|
||||
kvfree(fpl->edges);
|
||||
unix_free_vertices(fpl);
|
||||
}
|
||||
|
||||
static bool unix_vertex_dead(struct unix_vertex *vertex)
{
struct unix_edge *edge;
struct unix_sock *u;
long total_ref;

list_for_each_entry(edge, &vertex->edges, vertex_entry) {
struct unix_vertex *next_vertex = unix_edge_successor(edge);

/* The vertex's fd can be received by a non-inflight socket. */
if (!next_vertex)
return false;

/* The vertex's fd can be received by an inflight socket in
* another SCC.
*/
if (next_vertex->scc_index != vertex->scc_index)
return false;
}

/* No receiver exists out of the same SCC. */

edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
u = edge->predecessor;
total_ref = file_count(u->sk.sk_socket->file);

/* If not close()d, total_ref > out_degree. */
if (total_ref != vertex->out_degree)
return false;

return true;
}
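unix_vertex_dead() above treats a vertex as dead when every edge stays inside its own SCC and the backing file's reference count equals the vertex's out-degree, meaning every remaining reference is an in-flight one. A minimal userspace model of that predicate is sketched below; the vertex layout and field names are invented for illustration and are not the kernel types.

#include <stdbool.h>
#include <stdio.h>

struct vertex {
    int scc_index;          /* SCC the vertex was grouped into */
    int out_degree;         /* number of in-flight copies of its fd */
    int file_refs;          /* total references on the backing file */
    int nr_edges;
    struct vertex *succ[8]; /* receivers of this vertex's fd */
};

/* Dead if no edge leaves the SCC and every file reference is an in-flight one. */
static bool vertex_dead(const struct vertex *v)
{
    for (int i = 0; i < v->nr_edges; i++) {
        if (!v->succ[i])                           /* received by a non-inflight socket */
            return false;
        if (v->succ[i]->scc_index != v->scc_index) /* edge escapes the SCC */
            return false;
    }
    return v->file_refs == v->out_degree;          /* no fd installed in user space */
}

int main(void)
{
    struct vertex a = { .scc_index = 1, .out_degree = 1, .file_refs = 1 };
    struct vertex b = { .scc_index = 1, .out_degree = 1, .file_refs = 2 };

    a.nr_edges = 1; a.succ[0] = &b;
    b.nr_edges = 1; b.succ[0] = &a;

    printf("vertex a dead? %d\n", vertex_dead(&a));
    printf("SCC {a,b} dead? %d (b still has an external file ref)\n",
           vertex_dead(&a) && vertex_dead(&b));
    return 0;
}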
enum unix_recv_queue_lock_class {
|
||||
U_RECVQ_LOCK_NORMAL,
|
||||
U_RECVQ_LOCK_EMBRYO,
|
||||
};
|
||||
|
||||
static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist)
|
||||
{
|
||||
skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist);
|
||||
|
||||
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
|
||||
if (u->oob_skb) {
|
||||
WARN_ON_ONCE(skb_unref(u->oob_skb));
|
||||
u->oob_skb = NULL;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
|
||||
{
|
||||
struct unix_vertex *vertex;
|
||||
|
||||
list_for_each_entry_reverse(vertex, scc, scc_entry) {
|
||||
struct sk_buff_head *queue;
|
||||
struct unix_edge *edge;
|
||||
struct unix_sock *u;
|
||||
|
||||
edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
|
||||
u = edge->predecessor;
|
||||
queue = &u->sk.sk_receive_queue;
|
||||
|
||||
spin_lock(&queue->lock);
|
||||
|
||||
if (u->sk.sk_state == TCP_LISTEN) {
|
||||
struct sk_buff *skb;
|
||||
|
||||
skb_queue_walk(queue, skb) {
|
||||
struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;
|
||||
|
||||
/* listener -> embryo order, the inversion never happens. */
|
||||
spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
|
||||
unix_collect_queue(unix_sk(skb->sk), hitlist);
|
||||
spin_unlock(&embryo_queue->lock);
|
||||
}
|
||||
} else {
|
||||
unix_collect_queue(u, hitlist);
|
||||
}
|
||||
|
||||
spin_unlock(&queue->lock);
|
||||
}
|
||||
}
|
||||
|
||||
static bool unix_scc_cyclic(struct list_head *scc)
|
||||
{
|
||||
struct unix_vertex *vertex;
|
||||
struct unix_edge *edge;
|
||||
|
||||
/* SCC containing multiple vertices ? */
|
||||
if (!list_is_singular(scc))
|
||||
return true;
|
||||
|
||||
vertex = list_first_entry(scc, typeof(*vertex), scc_entry);
|
||||
|
||||
/* Self-reference or a embryo-listener circle ? */
|
||||
list_for_each_entry(edge, &vertex->edges, vertex_entry) {
|
||||
if (unix_edge_successor(edge) == vertex)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static LIST_HEAD(unix_visited_vertices);
|
||||
static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
|
||||
|
||||
static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index,
|
||||
struct sk_buff_head *hitlist)
|
||||
{
|
||||
LIST_HEAD(vertex_stack);
|
||||
struct unix_edge *edge;
|
||||
LIST_HEAD(edge_stack);
|
||||
|
||||
next_vertex:
|
||||
/* Push vertex to vertex_stack and mark it as on-stack
|
||||
* (index >= UNIX_VERTEX_INDEX_START).
|
||||
* The vertex will be popped when finalising SCC later.
|
||||
*/
|
||||
list_add(&vertex->scc_entry, &vertex_stack);
|
||||
|
||||
vertex->index = *last_index;
|
||||
vertex->scc_index = *last_index;
|
||||
(*last_index)++;
|
||||
|
||||
/* Explore neighbour vertices (receivers of the current vertex's fd). */
|
||||
list_for_each_entry(edge, &vertex->edges, vertex_entry) {
|
||||
struct unix_vertex *next_vertex = unix_edge_successor(edge);
|
||||
|
||||
if (!next_vertex)
|
||||
continue;
|
||||
|
||||
if (next_vertex->index == unix_vertex_unvisited_index) {
|
||||
/* Iterative deepening depth first search
|
||||
*
|
||||
* 1. Push a forward edge to edge_stack and set
|
||||
* the successor to vertex for the next iteration.
|
||||
*/
|
||||
list_add(&edge->stack_entry, &edge_stack);
|
||||
|
||||
vertex = next_vertex;
|
||||
goto next_vertex;
|
||||
|
||||
/* 2. Pop the edge directed to the current vertex
|
||||
* and restore the ancestor for backtracking.
|
||||
*/
|
||||
prev_vertex:
|
||||
edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
|
||||
list_del_init(&edge->stack_entry);
|
||||
|
||||
next_vertex = vertex;
|
||||
vertex = edge->predecessor->vertex;
|
||||
|
||||
/* If the successor has a smaller scc_index, two vertices
|
||||
* are in the same SCC, so propagate the smaller scc_index
|
||||
* to skip SCC finalisation.
|
||||
*/
|
||||
vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
|
||||
} else if (next_vertex->index != unix_vertex_grouped_index) {
|
||||
/* Loop detected by a back/cross edge.
|
||||
*
|
||||
* The successor is on vertex_stack, so two vertices are in
|
||||
* the same SCC. If the successor has a smaller *scc_index*,
|
||||
* propagate it to skip SCC finalisation.
|
||||
*/
|
||||
vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
|
||||
} else {
|
||||
/* The successor was already grouped as another SCC */
|
||||
}
|
||||
}
|
||||
|
||||
if (vertex->index == vertex->scc_index) {
|
||||
struct unix_vertex *v;
|
||||
struct list_head scc;
|
||||
bool scc_dead = true;
|
||||
|
||||
/* SCC finalised.
|
||||
*
|
||||
* If the scc_index was not updated, all the vertices above on
|
||||
* vertex_stack are in the same SCC. Group them using scc_entry.
|
||||
*/
|
||||
__list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
|
||||
|
||||
list_for_each_entry_reverse(v, &scc, scc_entry) {
|
||||
/* Don't restart DFS from this vertex in unix_walk_scc(). */
|
||||
list_move_tail(&v->entry, &unix_visited_vertices);
|
||||
|
||||
/* Mark vertex as off-stack. */
|
||||
v->index = unix_vertex_grouped_index;
|
||||
|
||||
if (scc_dead)
|
||||
scc_dead = unix_vertex_dead(v);
|
||||
}
|
||||
|
||||
if (scc_dead)
|
||||
unix_collect_skb(&scc, hitlist);
|
||||
else if (!unix_graph_maybe_cyclic)
|
||||
unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
|
||||
|
||||
list_del(&scc);
|
||||
}
|
||||
|
||||
/* Need backtracking ? */
|
||||
if (!list_empty(&edge_stack))
|
||||
goto prev_vertex;
|
||||
}
|
||||
|
||||
static void unix_walk_scc(struct sk_buff_head *hitlist)
|
||||
{
|
||||
unsigned long last_index = UNIX_VERTEX_INDEX_START;
|
||||
|
||||
unix_graph_maybe_cyclic = false;
|
||||
|
||||
/* Visit every vertex exactly once.
|
||||
* __unix_walk_scc() moves visited vertices to unix_visited_vertices.
|
||||
*/
|
||||
while (!list_empty(&unix_unvisited_vertices)) {
|
||||
struct unix_vertex *vertex;
|
||||
|
||||
vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
|
||||
__unix_walk_scc(vertex, &last_index, hitlist);
|
||||
}
|
||||
|
||||
list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
|
||||
swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
|
||||
|
||||
unix_graph_grouped = true;
|
||||
}
|
||||
|
||||
static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
|
||||
{
|
||||
unix_graph_maybe_cyclic = false;
|
||||
|
||||
while (!list_empty(&unix_unvisited_vertices)) {
|
||||
struct unix_vertex *vertex;
|
||||
struct list_head scc;
|
||||
bool scc_dead = true;
|
||||
|
||||
vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
|
||||
list_add(&scc, &vertex->scc_entry);
|
||||
|
||||
list_for_each_entry_reverse(vertex, &scc, scc_entry) {
|
||||
list_move_tail(&vertex->entry, &unix_visited_vertices);
|
||||
|
||||
if (scc_dead)
|
||||
scc_dead = unix_vertex_dead(vertex);
|
||||
}
|
||||
|
||||
if (scc_dead)
|
||||
unix_collect_skb(&scc, hitlist);
|
||||
else if (!unix_graph_maybe_cyclic)
|
||||
unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
|
||||
|
||||
list_del(&scc);
|
||||
}
|
||||
|
||||
list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
|
||||
}
|
||||
|
||||
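The __unix_walk_scc()/unix_walk_scc() pair above is an iterative variant of Tarjan's strongly connected components algorithm, using an explicit edge stack in place of recursion and a shared scc_index as the low-link value. The self-contained sketch below shows the classic recursive form of the same algorithm on a small adjacency-matrix graph; it is a textbook illustration, not the kernel implementation.

#include <stdio.h>

#define NV 4
#define UNVISITED -1

static int adj[NV][NV] = {
    /* 0 -> 1, 1 -> 2, 2 -> 0 form a cycle; 3 -> 0 hangs off it */
    {0, 1, 0, 0},
    {0, 0, 1, 0},
    {1, 0, 0, 0},
    {1, 0, 0, 0},
};

static int index_of[NV], lowlink[NV], on_stack[NV];
static int stack[NV], sp, next_index;

static int min(int a, int b) { return a < b ? a : b; }

static void strongconnect(int v)
{
    index_of[v] = lowlink[v] = next_index++;
    stack[sp++] = v;
    on_stack[v] = 1;

    for (int w = 0; w < NV; w++) {
        if (!adj[v][w])
            continue;
        if (index_of[w] == UNVISITED) {
            strongconnect(w);                          /* tree edge: recurse */
            lowlink[v] = min(lowlink[v], lowlink[w]);
        } else if (on_stack[w]) {
            lowlink[v] = min(lowlink[v], index_of[w]); /* back/cross edge inside the stack */
        }
    }

    if (lowlink[v] == index_of[v]) {                   /* v is the root of an SCC */
        int w;

        printf("SCC:");
        do {
            w = stack[--sp];
            on_stack[w] = 0;
            printf(" %d", w);
        } while (w != v);
        printf("\n");
    }
}

int main(void)
{
    for (int v = 0; v < NV; v++)
        index_of[v] = UNVISITED;
    for (int v = 0; v < NV; v++)
        if (index_of[v] == UNVISITED)
            strongconnect(v);
    return 0;
}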
static bool gc_in_progress;
|
||||
|
||||
static void __unix_gc(struct work_struct *work)
|
||||
{
|
||||
struct sk_buff *next_skb, *skb;
|
||||
struct unix_sock *u;
|
||||
struct unix_sock *next;
|
||||
struct sk_buff_head hitlist;
|
||||
struct list_head cursor;
|
||||
LIST_HEAD(not_cycle_list);
|
||||
struct sk_buff *skb;
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
/* First, select candidates for garbage collection. Only
|
||||
* in-flight sockets are considered, and from those only ones
|
||||
* which don't have any external reference.
|
||||
*
|
||||
* Holding unix_gc_lock will protect these candidates from
|
||||
* being detached, and hence from gaining an external
|
||||
* reference. Since there are no possible receivers, all
|
||||
* buffers currently on the candidates' queues stay there
|
||||
* during the garbage collection.
|
||||
*
|
||||
* We also know that no new candidate can be added onto the
|
||||
* receive queues. Other, non candidate sockets _can_ be
|
||||
* added to queue, so we must make sure only to touch
|
||||
* candidates.
|
||||
*
|
||||
* Embryos, though never candidates themselves, affect which
|
||||
* candidates are reachable by the garbage collector. Before
|
||||
* being added to a listener's queue, an embryo may already
|
||||
* receive data carrying SCM_RIGHTS, potentially making the
|
||||
* passed socket a candidate that is not yet reachable by the
|
||||
* collector. It becomes reachable once the embryo is
|
||||
* enqueued. Therefore, we must ensure that no SCM-laden
|
||||
* embryo appears in a (candidate) listener's queue between
|
||||
* consecutive scan_children() calls.
|
||||
*/
|
||||
list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
|
||||
struct sock *sk = &u->sk;
|
||||
long total_refs;
|
||||
|
||||
total_refs = file_count(sk->sk_socket->file);
|
||||
|
||||
BUG_ON(!u->inflight);
|
||||
BUG_ON(total_refs < u->inflight);
|
||||
if (total_refs == u->inflight) {
|
||||
list_move_tail(&u->link, &gc_candidates);
|
||||
__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
|
||||
__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
|
||||
|
||||
if (sk->sk_state == TCP_LISTEN) {
|
||||
unix_state_lock_nested(sk, U_LOCK_GC_LISTENER);
|
||||
unix_state_unlock(sk);
|
||||
}
|
||||
}
|
||||
if (!unix_graph_maybe_cyclic) {
|
||||
spin_unlock(&unix_gc_lock);
|
||||
goto skip_gc;
|
||||
}
|
||||
|
||||
/* Now remove all internal in-flight reference to children of
|
||||
* the candidates.
|
||||
*/
|
||||
list_for_each_entry(u, &gc_candidates, link)
|
||||
scan_children(&u->sk, dec_inflight, NULL);
|
||||
__skb_queue_head_init(&hitlist);
|
||||
|
||||
/* Restore the references for children of all candidates,
|
||||
* which have remaining references. Do this recursively, so
|
||||
* only those remain, which form cyclic references.
|
||||
*
|
||||
* Use a "cursor" link, to make the list traversal safe, even
|
||||
* though elements might be moved about.
|
||||
*/
|
||||
list_add(&cursor, &gc_candidates);
|
||||
while (cursor.next != &gc_candidates) {
|
||||
u = list_entry(cursor.next, struct unix_sock, link);
|
||||
|
||||
/* Move cursor to after the current position. */
|
||||
list_move(&cursor, &u->link);
|
||||
|
||||
if (u->inflight) {
|
||||
list_move_tail(&u->link, ¬_cycle_list);
|
||||
__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
|
||||
scan_children(&u->sk, inc_inflight_move_tail, NULL);
|
||||
}
|
||||
}
|
||||
list_del(&cursor);
|
||||
|
||||
/* Now gc_candidates contains only garbage. Restore original
|
||||
* inflight counters for these as well, and remove the skbuffs
|
||||
* which are creating the cycle(s).
|
||||
*/
|
||||
skb_queue_head_init(&hitlist);
|
||||
list_for_each_entry(u, &gc_candidates, link) {
|
||||
scan_children(&u->sk, inc_inflight, &hitlist);
|
||||
|
||||
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
|
||||
if (u->oob_skb) {
|
||||
kfree_skb(u->oob_skb);
|
||||
u->oob_skb = NULL;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* not_cycle_list contains those sockets which do not make up a
|
||||
* cycle. Restore these to the inflight list.
|
||||
*/
|
||||
while (!list_empty(¬_cycle_list)) {
|
||||
u = list_entry(not_cycle_list.next, struct unix_sock, link);
|
||||
__clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
|
||||
list_move_tail(&u->link, &gc_inflight_list);
|
||||
}
|
||||
if (unix_graph_grouped)
|
||||
unix_walk_scc_fast(&hitlist);
|
||||
else
|
||||
unix_walk_scc(&hitlist);
|
||||
|
||||
spin_unlock(&unix_gc_lock);
|
||||
|
||||
/* We need io_uring to clean its registered files, ignore all io_uring
|
||||
* originated skbs. It's fine as io_uring doesn't keep references to
|
||||
* other io_uring instances and so killing all other files in the cycle
|
||||
* will put all io_uring references forcing it to go through normal
|
||||
* release.path eventually putting registered files.
|
||||
*/
|
||||
skb_queue_walk_safe(&hitlist, skb, next_skb) {
|
||||
if (skb->destructor == io_uring_destruct_scm) {
|
||||
__skb_unlink(skb, &hitlist);
|
||||
skb_queue_tail(&skb->sk->sk_receive_queue, skb);
|
||||
}
|
||||
skb_queue_walk(&hitlist, skb) {
|
||||
if (UNIXCB(skb).fp)
|
||||
UNIXCB(skb).fp->dead = true;
|
||||
}
|
||||
|
||||
/* Here we are. Hitlist is filled. Die. */
|
||||
__skb_queue_purge(&hitlist);
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
/* There could be io_uring registered files, just push them back to
|
||||
* the inflight list
|
||||
*/
|
||||
list_for_each_entry_safe(u, next, &gc_candidates, link)
|
||||
list_move_tail(&u->link, &gc_inflight_list);
|
||||
|
||||
/* All candidates should have been detached by now. */
|
||||
BUG_ON(!list_empty(&gc_candidates));
|
||||
|
||||
/* Paired with READ_ONCE() in wait_for_unix_gc(). */
|
||||
skip_gc:
|
||||
WRITE_ONCE(gc_in_progress, false);
|
||||
|
||||
spin_unlock(&unix_gc_lock);
|
||||
}
|
||||
|
||||
static DECLARE_WORK(unix_gc_work, __unix_gc);
@@ -335,8 +605,9 @@ void unix_gc(void)
}

#define UNIX_INFLIGHT_TRIGGER_GC 16000
#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)

void wait_for_unix_gc(void)
void wait_for_unix_gc(struct scm_fp_list *fpl)
{
/* If number of inflight sockets is insane,
* force a garbage collect right now.
@@ -348,6 +619,13 @@ void wait_for_unix_gc(void)
!READ_ONCE(gc_in_progress))
unix_gc();

/* Penalise users who want to send AF_UNIX sockets
* but whose sockets have not been received yet.
*/
if (!fpl || !fpl->count_unix ||
READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
return;

if (READ_ONCE(gc_in_progress))
flush_work(&unix_gc_work);
}
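The reworked wait_for_unix_gc() above takes the scm_fp_list being sent so that only senders who actually have many AF_UNIX fds still in flight are throttled: GC is kicked once the global in-flight count passes UNIX_INFLIGHT_TRIGGER_GC, and the sender is made to wait for a running GC only when its own user exceeds UNIX_INFLIGHT_SANE_USER. The userspace model below mirrors that decision with invented counters and print statements in place of the kernel state and work queue.

#include <stdbool.h>
#include <stdio.h>

#define TRIGGER_GC 16000     /* stand-in for UNIX_INFLIGHT_TRIGGER_GC */
#define SANE_USER  (253 * 8) /* stand-in for UNIX_INFLIGHT_SANE_USER  */

struct send_ctx {
    bool sending_unix_fds;      /* does this message carry AF_UNIX fds? */
    unsigned int user_inflight; /* this user's fds still in flight */
};

static unsigned int total_inflight; /* global in-flight counter */
static bool gc_running;

static void start_gc(void)    { gc_running = true;  printf("gc started\n"); }
static void wait_for_gc(void) { printf("sender blocked until gc finishes\n"); gc_running = false; }

/* Model of wait_for_unix_gc(fpl): kick GC globally, throttle only heavy users. */
static void maybe_gc(const struct send_ctx *ctx)
{
    if (total_inflight > TRIGGER_GC && !gc_running)
        start_gc();

    /* Well-behaved senders (no unix fds, or few in flight) are never blocked. */
    if (!ctx->sending_unix_fds || ctx->user_inflight < SANE_USER)
        return;

    if (gc_running)
        wait_for_gc();
}

int main(void)
{
    struct send_ctx light = { .sending_unix_fds = false, .user_inflight = 10 };
    struct send_ctx heavy = { .sending_unix_fds = true,  .user_inflight = 5000 };

    total_inflight = 20000;
    maybe_gc(&light); /* kicks gc, does not block */
    maybe_gc(&heavy); /* blocked until the running gc completes */
    return 0;
}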
net/unix/scm.c (156 lines deleted)
@@ -1,156 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/net.h>
|
||||
#include <linux/fs.h>
|
||||
#include <net/af_unix.h>
|
||||
#include <net/scm.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include "scm.h"
|
||||
|
||||
unsigned int unix_tot_inflight;
|
||||
EXPORT_SYMBOL(unix_tot_inflight);
|
||||
|
||||
LIST_HEAD(gc_inflight_list);
|
||||
EXPORT_SYMBOL(gc_inflight_list);
|
||||
|
||||
DEFINE_SPINLOCK(unix_gc_lock);
|
||||
EXPORT_SYMBOL(unix_gc_lock);
|
||||
|
||||
struct unix_sock *unix_get_socket(struct file *filp)
|
||||
{
|
||||
struct inode *inode = file_inode(filp);
|
||||
|
||||
/* Socket ? */
|
||||
if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
|
||||
struct socket *sock = SOCKET_I(inode);
|
||||
const struct proto_ops *ops = READ_ONCE(sock->ops);
|
||||
struct sock *s = sock->sk;
|
||||
|
||||
/* PF_UNIX ? */
|
||||
if (s && ops && ops->family == PF_UNIX)
|
||||
return unix_sk(s);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
EXPORT_SYMBOL(unix_get_socket);
|
||||
|
||||
/* Keep the number of times in flight count for the file
|
||||
* descriptor if it is for an AF_UNIX socket.
|
||||
*/
|
||||
void unix_inflight(struct user_struct *user, struct file *fp)
|
||||
{
|
||||
struct unix_sock *u = unix_get_socket(fp);
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
if (u) {
|
||||
if (!u->inflight) {
|
||||
BUG_ON(!list_empty(&u->link));
|
||||
list_add_tail(&u->link, &gc_inflight_list);
|
||||
} else {
|
||||
BUG_ON(list_empty(&u->link));
|
||||
}
|
||||
u->inflight++;
|
||||
/* Paired with READ_ONCE() in wait_for_unix_gc() */
|
||||
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
|
||||
}
|
||||
WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
|
||||
spin_unlock(&unix_gc_lock);
|
||||
}
|
||||
|
||||
void unix_notinflight(struct user_struct *user, struct file *fp)
|
||||
{
|
||||
struct unix_sock *u = unix_get_socket(fp);
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
if (u) {
|
||||
BUG_ON(!u->inflight);
|
||||
BUG_ON(list_empty(&u->link));
|
||||
|
||||
u->inflight--;
|
||||
if (!u->inflight)
|
||||
list_del_init(&u->link);
|
||||
/* Paired with READ_ONCE() in wait_for_unix_gc() */
|
||||
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
|
||||
}
|
||||
WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
|
||||
spin_unlock(&unix_gc_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* The "user->unix_inflight" variable is protected by the garbage
|
||||
* collection lock, and we just read it locklessly here. If you go
|
||||
* over the limit, there might be a tiny race in actually noticing
|
||||
* it across threads. Tough.
|
||||
*/
|
||||
static inline bool too_many_unix_fds(struct task_struct *p)
|
||||
{
|
||||
struct user_struct *user = current_user();
|
||||
|
||||
if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
|
||||
return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
|
||||
return false;
|
||||
}
|
||||
|
||||
int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (too_many_unix_fds(current))
|
||||
return -ETOOMANYREFS;
|
||||
|
||||
/*
|
||||
* Need to duplicate file references for the sake of garbage
|
||||
* collection. Otherwise a socket in the fps might become a
|
||||
* candidate for GC while the skb is not yet queued.
|
||||
*/
|
||||
UNIXCB(skb).fp = scm_fp_dup(scm->fp);
|
||||
if (!UNIXCB(skb).fp)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = scm->fp->count - 1; i >= 0; i--)
|
||||
unix_inflight(scm->fp->user, scm->fp->fp[i]);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(unix_attach_fds);
|
||||
|
||||
void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
|
||||
{
|
||||
int i;
|
||||
|
||||
scm->fp = UNIXCB(skb).fp;
|
||||
UNIXCB(skb).fp = NULL;
|
||||
|
||||
for (i = scm->fp->count-1; i >= 0; i--)
|
||||
unix_notinflight(scm->fp->user, scm->fp->fp[i]);
|
||||
}
|
||||
EXPORT_SYMBOL(unix_detach_fds);
|
||||
|
||||
void unix_destruct_scm(struct sk_buff *skb)
|
||||
{
|
||||
struct scm_cookie scm;
|
||||
|
||||
memset(&scm, 0, sizeof(scm));
|
||||
scm.pid = UNIXCB(skb).pid;
|
||||
if (UNIXCB(skb).fp)
|
||||
unix_detach_fds(&scm, skb);
|
||||
|
||||
/* Alas, it calls VFS */
|
||||
/* So fscking what? fput() had been SMP-safe since the last Summer */
|
||||
scm_destroy(&scm);
|
||||
sock_wfree(skb);
|
||||
}
|
||||
EXPORT_SYMBOL(unix_destruct_scm);
|
||||
|
||||
void io_uring_destruct_scm(struct sk_buff *skb)
|
||||
{
|
||||
unix_destruct_scm(skb);
|
||||
}
|
||||
EXPORT_SYMBOL(io_uring_destruct_scm);
|
@@ -1,10 +0,0 @@
#ifndef NET_UNIX_SCM_H
#define NET_UNIX_SCM_H

extern struct list_head gc_inflight_list;
extern spinlock_t unix_gc_lock;

int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb);
void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb);

#endif
@@ -105,7 +105,8 @@ static int __init sample_trace_array_init(void)
* NOTE: This function increments the reference counter
* associated with the trace array - "tr".
*/
tr = trace_array_get_by_name("sample-instance");
tr = trace_array_get_by_name_ext("sample-instance",
"sched,timer,kprobes");

if (!tr)
return -1;
@@ -88,6 +88,76 @@ static void write_debugfs(const char *fmt, ...)
}
}

static char *allocate_zero_filled_hugepage(size_t len)
{
char *result;
size_t i;

result = memalign(pmd_pagesize, len);
if (!result) {
printf("Fail to allocate memory\n");
exit(EXIT_FAILURE);
}

madvise(result, len, MADV_HUGEPAGE);

for (i = 0; i < len; i++)
result[i] = (char)0;

return result;
}

static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len)
{
unsigned long rss_anon_before, rss_anon_after;
size_t i;

if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
printf("No THP is allocated\n");
exit(EXIT_FAILURE);
}

rss_anon_before = rss_anon();
if (!rss_anon_before) {
printf("No RssAnon is allocated before split\n");
exit(EXIT_FAILURE);
}

/* split all THPs */
write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
(uint64_t)one_page + len, 0);

for (i = 0; i < len; i++)
if (one_page[i] != (char)0) {
printf("%ld byte corrupted\n", i);
exit(EXIT_FAILURE);
}

if (!check_huge_anon(one_page, 0, pmd_pagesize)) {
printf("Still AnonHugePages not split\n");
exit(EXIT_FAILURE);
}

rss_anon_after = rss_anon();
if (rss_anon_after >= rss_anon_before) {
printf("Incorrect RssAnon value. Before: %ld After: %ld\n",
rss_anon_before, rss_anon_after);
exit(EXIT_FAILURE);
}
}

void split_pmd_zero_pages(void)
{
char *one_page;
int nr_hpages = 4;
size_t len = nr_hpages * pmd_pagesize;

one_page = allocate_zero_filled_hugepage(len);
verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len);
printf("Split zero filled huge pages successful\n");
free(one_page);
}

void split_pmd_thp(void)
{
char *one_page;
@@ -305,6 +375,7 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}

split_pmd_zero_pages();
split_pmd_thp();
split_pte_mapped_thp();
split_file_backed_thp();
@@ -11,6 +11,7 @@

#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
#define SMAP_FILE_PATH "/proc/self/smaps"
#define STATUS_FILE_PATH "/proc/self/status"
#define MAX_LINE_LENGTH 500

unsigned int __page_size;
@@ -97,6 +98,27 @@ uint64_t read_pmd_pagesize(void)
return strtoul(buf, NULL, 10);
}

unsigned long rss_anon(void)
{
unsigned long rss_anon = 0;
FILE *fp;
char buffer[MAX_LINE_LENGTH];

fp = fopen(STATUS_FILE_PATH, "r");
if (!fp)
ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH);

if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer)))
goto err_out;

if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1)
ksft_exit_fail_msg("Reading status error\n");

err_out:
fclose(fp);
return rss_anon;
}

bool __check_huge(void *addr, char *pattern, int nr_hpages,
uint64_t hpage_size)
{
@@ -39,6 +39,7 @@ unsigned long pagemap_get_pfn(int fd, char *start);
void clear_softdirty(void);
bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
uint64_t read_pmd_pagesize(void);
unsigned long rss_anon(void);
bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);