Merge android15-6.6 into android15-6.6-lts

This merges the android15-6.6 branch into the -lts branch, catching
it up with the latest changes in there.

It contains the following commits:

* 3a0107a38e ANDROID: KVM: arm64: Ensure SVE initialization precedes PSCI for protected VCPUs
* 3b75103301 ANDROID: 16K: Use vma_area slab cache for pad VMA
* a213abada8 UPSTREAM: af_unix: Fix uninit-value in __unix_walk_scc()
* 5156d49ed9 UPSTREAM: af_unix: Fix garbage collection of embryos carrying OOB with SCM_RIGHTS
* fbd783363d ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'dead'
* ddd6979a15 UPSTREAM: af_unix: Add dead flag to struct scm_fp_list.
* 95a397ac6b UPSTREAM: af_unix: Don't access successor in unix_del_edges() during GC.
* a130d07d24 UPSTREAM: af_unix: Try not to hold unix_gc_lock during accept().
* 5ada288086 UPSTREAM: af_unix: Remove lock dance in unix_peek_fds().
* 11d208f893 UPSTREAM: af_unix: Replace garbage collection algorithm.
* 67a3a58da1 UPSTREAM: af_unix: Detect dead SCC.
* b9f8dfdb54 UPSTREAM: af_unix: Assign a unique index to SCC.
* b22b0a7597 UPSTREAM: af_unix: Avoid Tarjan's algorithm if unnecessary.
* 1e4d62adeb UPSTREAM: af_unix: Skip GC if no cycle exists.
* 250c362acd UPSTREAM: af_unix: Save O(n) setup of Tarjan's algo.
* 0c40a05117 UPSTREAM: af_unix: Fix up unix_edge.successor for embryo socket.
* f5ea8b439d UPSTREAM: af_unix: Save listener for embryo socket.
* 279ed20d5f UPSTREAM: af_unix: Detect Strongly Connected Components.
* 16dca90335 UPSTREAM: af_unix: Iterate all vertices by DFS.
* 80df4d17af UPSTREAM: af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb.
* 40549e6976 ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'inflight'
* 769fc01f23 UPSTREAM: af_unix: Link struct unix_edge when queuing skb.
* de6b1e85b9 ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'edges'
* 844c9666eb UPSTREAM: af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd.
* c93b3ba51e ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'vertices'
* ffef32ddaf UPSTREAM: af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd.
* f972f2d7b1 ANDROID: af_unix: Allocate memory for the largest possible size of 'struct scm_fp_list'
* b077571da9 UPSTREAM: af_unix: Remove CONFIG_UNIX_SCM.
* a390e62751 ANDROID: Align x86-64 microdroid cgroup support with aarch64 microdroid
* 6dbb3c2e90 BACKPORT: mm: remove folio from deferred split list before uncharging it
* a8553b4e2a BACKPORT: mm: use __page_cache_release() in folios_put()
* 4d61851d14 UPSTREAM: mm: fix list corruption in put_pages_list
* f61f355bdc UPSTREAM: mm: use free_unref_folios() in put_pages_list()
* 316b2e6e4b BACKPORT: mm: remove use of folio list from folios_put()
* f9c6fb1b82 BACKPORT: memcg: add mem_cgroup_uncharge_folios()
* 3bc695b2be Merge tag 'android15-6.6.92_r00' into android15-6.6
* 0813441033 FROMGIT: scsi: core: ufs: Fix a hang in the error handler
* a74f052176 FROMGIT: genirq/cpuhotplug: Restore affinity even for suspended IRQ
* fc6844d9d2 FROMGIT: genirq/cpuhotplug: Rebalance managed interrupts across multi-CPU hotplug
* 0bc63a98d9 ANDROID: abi_gki_aarch64_vivo: Update symbol list
* 8fb77f6f9d ANDROID: mm: Reset unused page flag bits on free
* f0bd864fe0 Revert "ANDROID: mm: Set PAGE_BLOCK_ORDER to 7 when ARM64_16K_PAGES"
* 97f5b70ad3 ANDROID: GKI: Update symbol list for xiaomi
* 2bc7bc937c BACKPORT: erofs: lazily initialize per-CPU workers and CPU hotplug hooks
* 434940a426 FROMGIT: scsi: ufs: mcq: Delete ufshcd_release_scsi_cmd() in ufshcd_mcq_abort()
* 0ac9aa9b62 ANDROID: GKI: Rename xring's symbol list.
* f56b0532df BACKPORT: mm: set pageblock_order to HPAGE_PMD_ORDER in case with !CONFIG_HUGETLB_PAGE but THP enabled
* f19494634f ANDROID: GKI: Update symbol list for vivo
* 68191d9c7a ANDROID: vendor_hooks: add hook to retry mempool allocation without delay
* 45afa56280 ANDROID: mm: Set PAGE_BLOCK_ORDER to 7 when ARM64_16K_PAGES
* 3148030c78 ANDROID: KVM: arm64: Fix hyp_alloc(0)
* 4ec55296c6 ANDROID: fix out-of-bounds error when trace_create_new_event
* d9ec0e18f4 ANDROID: CONFIG_CRYPTO_SHA1_ARM64_CE=y to GKI and Microdroid kernel
* 0272a2ffdc BACKPORT: FROMGIT: mm: add CONFIG_PAGE_BLOCK_ORDER to select page block order
* 86ba3f3eb2 BACKPORT: binder: Create safe versions of binder log files
* 8a55e7a02a UPSTREAM: binder: Refactor binder_node print synchronization
* fe02cfa135 ANDROID: iommu/arm-smmu-v3-kvm: Fix accidental domain ID freeing in free()
* 9733cd1fa2 ANDROID: GKI: Update xiaomi symbol list.
* 125f87a148 UPSTREAM: mm/memcg: use kmem_cache when alloc memcg pernode info
* 78e6a3d422 UPSTREAM: mm/memcg: use kmem_cache when alloc memcg
* b6bde4b648 UPSTREAM: mm/memcg: move mem_cgroup_init() ahead of cgroup_init()
* 476cb9bc9b UPSTREAM: af_unix: Remove io_uring code for GC.
* fb219cbb0b UPSTREAM: af_unix: Replace BUG_ON() with WARN_ON_ONCE().
* 3c39219343 ANDROID: Enable memory controller for microdroid
* c6325b075d ANDROID: cgroup: Fix cgroup_root backport padding calculation
* 452d899d2f ANDROID: GKI: Fix up abi issue in struct scm_fp_list
* cec9cb02ce UPSTREAM: af_unix: Try to run GC async.
* 93c2d24134 BACKPORT: FROMGIT: usb: typec: tcpm: move tcpm_queue_vdm_unlocked to asynchronous work
* ee016b98b7 BACKPORT: usb: typec: tcpm: enforce ready state when queueing alt mode vdm
* 4be94a6b03 ANDROID: ABI: Update pixel symbol list
* 6af2e78f07 ANDROID: fix ABI breakage for trace_array extensions
* 6f62c0d0fb UPSTREAM: tracing: Allow creating instances with specified system events
* f8d73c6178 UPSTREAM: af_unix: Run GC on only one CPU.
* a70bd568b1 UPSTREAM: af_unix: Return struct unix_sock from unix_get_socket().
* c1b974e51d UPSTREAM: iommu: Handle race with default domain setup
* 315fdde476 ANDROID: ABI: Update pixel symbol list
* 32288ce2f2 ANDROID: vendor_hooks: Add hooks for xhci reset
* dd8fcb5398 ANDROID: GKI: deferred split queue corruption - ABI fixup
* 374babecde UPSTREAM: mm/thp: fix deferred split queue not partially_mapped: fix
* 3a8faa5b25 BACKPORT: mm/thp: fix deferred split unqueue naming and locking
* 84cc354617 UPSTREAM: mm/thp: fix deferred split queue not partially_mapped
* dd46964f3e BACKPORT: mm: add sysfs entry to disable splitting underused THPs
* 40ffd525e5 UPSTREAM: mm: split underused THPs
* a63eadb11d BACKPORT: mm: introduce a pageflag for partially mapped folios
* f1b73b0513 UPSTREAM: mm/migrate: fix kernel BUG at mm/compaction.c:2761!
* cbbd153073 BACKPORT: mm/migrate: split source folio if it is on deferred split list
* c6f085c328 BACKPORT: mm: count the number of partially mapped anonymous THPs per size
* 545db6094c BACKPORT: mm: count the number of anonymous THPs per size
* 6ee860d0d4 UPSTREAM: mm: separate out FOLIO_FLAGS from PAGEFLAGS
* f052bbc24d UPSTREAM: mm: selftest to verify zero-filled pages are mapped to zeropage
* d826c84482 BACKPORT: mm: remap unused subpages to shared zeropage when splitting isolated thp
* bc9f1a0f43 Revert "BACKPORT: mm/thp: fix deferred split unqueue naming and locking"
* c06fa3b5cd ANDROID: GKI: page_alloc ABI fixup
* 819bdc71dc BACKPORT: mm: page_alloc: batch vmstat updates in expand()
* c97dfdfac0 UPSTREAM: mm/page_alloc: keep track of free highatomic
* cdff4faf2b UPSTREAM: mm: remove unused has_isolate_pageblock
* 5b5902fcf6 UPSTREAM: mm: page_alloc: fix highatomic typing in multi-block buddies
* 48e8763a95 BACKPORT: mm: page_alloc: consolidate free page accounting
* a4f7bd4b3d BACKPORT: mm: page_isolation: prepare for hygienic freelists
* a8dcfbc68b UPSTREAM: mm: page_alloc: set migratetype inside move_freepages()
* 209c219a0f BACKPORT: mm: page_alloc: close migratetype race between freeing and stealing
* 1a3654f59a BACKPORT: mm: page_alloc: fix freelist movement during block conversion
* 861e9d3c44 UPSTREAM: mm: page_alloc: fix move_freepages_block() range error
* 350c3b1d61 UPSTREAM: mm: page_alloc: move free pages when converting block during isolation
* f76299151c UPSTREAM: mm: page_alloc: fix up block types when merging compatible blocks
* cb610236ed UPSTREAM: mm: page_alloc: optimize free_unref_folios()
* 606130dacb BACKPORT: mm: page_alloc: remove pcppage migratetype caching
* a7a880e6de UPSTREAM: mm: allow non-hugetlb large folios to be batch processed
* f17c4db9cf BACKPORT: mm: handle large folios in free_unref_folios()
* c7f67cfb85 UPSTREAM: mm: use folios_put() in __folio_batch_release()
* 445fa9a71a BACKPORT: mm: add free_unref_folios()
* cc058410b3 BACKPORT: mm: convert free_unref_page_list() to use folios
* 980cb4e2ba BACKPORT: mm: make folios_put() the basis of release_pages()
* 5f4ed005d7 Revert "BACKPORT: mm: page_alloc: remove pcppage migratetype caching"
* bab99c1b7e Revert "UPSTREAM: mm: page_alloc: fix up block types when merging compatible blocks"
* 94e3afbb3d Revert "UPSTREAM: mm: page_alloc: move free pages when converting block during isolation"
* 13aa15180a Revert "UPSTREAM: mm: page_alloc: fix move_freepages_block() range error"
* d47518de38 Revert "UPSTREAM: mm: page_alloc: fix freelist movement during block conversion"
* 135ab7374e Revert "BACKPORT: mm: page_alloc: close migratetype race between freeing and stealing"
* 9ed2d2fba2 Revert "UPSTREAM: mm: page_alloc: set migratetype inside move_freepages()"
* efbdb11ac1 Revert "BACKPORT: mm: page_isolation: prepare for hygienic freelists"
* 7d424e0f80 Revert "BACKPORT: mm: page_alloc: consolidate free page accounting"
* 8a91cd1d26 Revert "BACKPORT: mm: page_alloc: batch vmstat updates in expand()"
* be6d3cc085 Revert "UPSTREAM: mm: page_alloc: fix highatomic typing in multi-block buddies"
* bbc65a78d2 Revert "BACKPORT: mm/page_alloc: keep track of free highatomic"
* a7a0d95bca Revert "BACKPORT: mm: page_alloc: optimize free_unref_folios()"
* 8b5d78fb5c Revert "ANDROID: fuse-bpf: fix wrong logic in read backing"
* c1488e58c3 ANDROID: GKI: Update symbol list for Nvidia
* 1e3d640b05 ANDROID: GKI: Add initial Nvidia symbol list
* 5fa476bd0b ANDROID: Add ufs headers to aarch64 allowlist
* 17daf81bcc ANDROID: KVM: arm64: Allow relinqush for p-guest with huge-mappings
* 297e1ff805 ANDROID: KVM: arm64: Use unmap for pKVM guests memory relinquish
* 7c95a219c0 ANDROID: KVM: arm64: Add hyp request SPLIT
* e56d181356 ANDROID: KVM: arm64: Convert kvm_pinned_pages to an interval-tree
* 390699f93d ANDROID: KVM: arm64: Add host_split_guest for pKVM
* 16df80ab9c ANDROID: KVM: arm64: Disable relinquish for p-guest huge-mappings
* 549ac47ca0 FROMGIT: PM: runtime: fix denying of auto suspend in pm_suspend_timer_fn()
* 4cdfd02ff2 ANDROID: Enable SHA1 for microdroid
* ab0ad8d198 BACKPORT: mm: page_alloc: optimize free_unref_folios()

Change-Id: Ic5571553dd22417e2ff66c8e99c114b8d79476f2
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Committed by Greg Kroah-Hartman on 2025-06-25 09:43:52 +00:00; 87 changed files with 3280 additions and 989 deletions.

@@ -126,6 +126,7 @@ filegroup(
         "android/abi_gki_aarch64_mtk",
         "android/abi_gki_aarch64_mtktv",
         "android/abi_gki_aarch64_nothing",
+        "android/abi_gki_aarch64_nvidia",
         "android/abi_gki_aarch64_oplus",
         "android/abi_gki_aarch64_paragon",
         "android/abi_gki_aarch64_pixel",
@@ -140,7 +141,7 @@ filegroup(
         "android/abi_gki_aarch64_virtual_device",
         "android/abi_gki_aarch64_vivo",
         "android/abi_gki_aarch64_xiaomi",
-        "android/abi_gki_aarch64_xiaomi2",
+        "android/abi_gki_aarch64_xiaomi_xring",
     ],
     visibility = ["//visibility:public"],
 )
@@ -1028,6 +1029,9 @@ ddk_headers(
         "drivers/pci/controller/dwc/pcie-designware.h",
         "drivers/thermal/thermal_core.h",
         "drivers/thermal/thermal_netlink.h",
+        "drivers/ufs/core/ufshcd-crypto.h",
+        "drivers/ufs/core/ufshcd-priv.h",
+        "drivers/ufs/host/ufshcd-pltfrm.h",
         "drivers/usb/dwc3/core.h",
         "sound/usb/card.h",
         "sound/usb/usbaudio.h",
@@ -1045,6 +1049,7 @@ ddk_headers(
         "drivers/extcon",
         "drivers/pci/controller/dwc",
         "drivers/thermal",
+        "drivers/ufs",
         "drivers/usb",
         "sound/usb",
         "include",

@@ -202,6 +202,16 @@ PMD-mappable transparent hugepage::

 	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size

+All THPs at fault and collapse time will be added to _deferred_list,
+and will therefore be split under memory pressure if they are considered
+"underused". A THP is underused if the number of zero-filled pages in
+the THP is above max_ptes_none (see below). It is possible to disable
+this behaviour by writing 0 to shrink_underused, and enable it by writing
+1 to it::
+
+	echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused
+	echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused
+
 khugepaged will be automatically started when one or more hugepage
 sizes are enabled (either by directly setting "always" or "madvise",
 or by setting "inherit" while the top-level enabled is set to "always"
@@ -443,6 +453,12 @@ thp_deferred_split_page
 	splitting it would free up some memory. Pages on split queue are
 	going to be split under memory pressure.

+thp_underused_split_page
+	is incremented when a huge page on the split queue was split
+	because it was underused. A THP is underused if the number of
+	zero pages in the THP is above a certain threshold
+	(/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none).
+
 thp_split_pmd
 	is incremented every time a PMD split into table of PTEs.
 	This can happen, for instance, when application calls mprotect() or
@@ -510,6 +526,18 @@ split_deferred
 	it would free up some memory. Pages on split queue are going to
 	be split under memory pressure, if splitting is possible.

+nr_anon
+	the number of anonymous THP we have in the whole system. These THPs
+	might be currently entirely mapped or have partially unmapped/unused
+	subpages.
+
+nr_anon_partially_mapped
+	the number of anonymous THP which are likely partially mapped, possibly
+	wasting memory, and have been queued for deferred memory reclamation.
+	Note that in some corner cases (e.g., failed migration), we might detect
+	an anonymous THP as "partially mapped" and count it here, even though it
+	is not actually partially mapped anymore.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in ``/proc/vmstat`` to help
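The knob and counters documented above can be read back from userspace. Below is a minimal, illustrative C snippet (not part of the patch) that dumps the two sysfs files quoted verbatim in the text; writing 0 or 1 to shrink_underused, as in the echo examples, typically requires root.

/*
 * Minimal sketch: read the THP knobs referenced in the documentation hunk
 * above. Only the two paths that appear verbatim in the text are used;
 * error handling is reduced to perror() for brevity.
 */
#include <stdio.h>

static void dump_file(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	fclose(f);
}

int main(void)
{
	dump_file("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size");
	dump_file("/sys/kernel/mm/transparent_hugepage/shrink_underused");
	return 0;
}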

File diff suppressed because it is too large.

@@ -132,3 +132,84 @@ type 'struct io_ring_ctx' changed
 1 variable symbol(s) removed
   'struct tracepoint __tracepoint_android_vh_filemap_fault_before_folio_locked'

+type 'struct kvm_protected_vm' changed
+  member 'struct maple_tree pinned_pages' was removed
+  member 'union { struct rb_root_cached pinned_pages; struct { struct maple_tree __unused; }; union { }; }' was added
+
+type 'struct kvm_hyp_req' changed
+  member changed from 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; }' to 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; struct { unsigned long guest_ipa; size_t size; } split; }'
+    type changed from 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; }' to 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; struct { unsigned long guest_ipa; size_t size; } split; }'
+      member 'struct { unsigned long guest_ipa; size_t size; } split' was added
+
+type 'struct scm_fp_list' changed
+  byte size changed from 2040 to 2048
+  member 'short count_unix' was added
+
+type 'struct scm_fp_list' changed
+  byte size changed from 2048 to 2064
+  member 'struct list_head vertices' was added
+  member 'short count_unix' changed
+    offset changed by 128
+
+type 'struct scm_fp_list' changed
+  byte size changed from 2064 to 2072
+  member 'struct unix_edge* edges' was added
+  member 'short count_unix' changed
+    offset changed by 64
+
+type 'struct scm_fp_list' changed
+  byte size changed from 2072 to 2080
+  member 'bool inflight' was added
+  3 members ('struct list_head vertices' .. 'short count_unix') changed
+    offset changed by 64
+
+type 'struct unix_edge' changed
+  byte size changed from 32 to 48
+  member 'struct list_head stack_entry' was added
+
+type 'struct unix_vertex' changed
+  byte size changed from 40 to 48
+  member 'unsigned long index' was added
+
+type 'struct unix_vertex' changed
+  byte size changed from 48 to 80
+  member 'struct list_head scc_entry' was added
+  2 members ('unsigned long out_degree' .. 'unsigned long index') changed
+    offset changed by 128
+  member 'unsigned long lowlink' was added
+  member 'bool on_stack' was added
+
+type 'struct unix_sock' changed
+  member 'struct sock* listener' was added
+  4 members ('struct list_head link' .. 'unsigned long gc_flags') changed
+    offset changed by 64
+
+type 'struct unix_vertex' changed
+  byte size changed from 80 to 72
+  member 'bool on_stack' was removed
+
+type 'struct unix_vertex' changed
+  member 'unsigned long lowlink' was removed
+  member 'unsigned long scc_index' was added
+
+type 'struct unix_sock' changed
+  byte size changed from 1216 to 1152
+  member 'struct list_head link' was removed
+  member 'unsigned long inflight' was removed
+  member 'spinlock_t lock' changed
+    offset changed by -192
+  member 'unsigned long gc_flags' was removed
+  4 members ('struct socket_wq peer_wq' .. 'struct sk_buff* oob_skb') changed
+    offset changed by -512
+
+type 'struct unix_sock' changed
+  member 'struct sk_buff* oob_skb' changed
+    offset changed by 64
+
+type 'struct scm_stat' changed
+  byte size changed from 4 to 16
+  member 'unsigned long nr_unix_fds' was added
+
+type 'struct scm_fp_list' changed
+  member 'bool dead' was added
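The unix_vertex churn above (index, lowlink and on_stack appearing, then lowlink/on_stack giving way to scc_index) is the bookkeeping used by the SCC-based af_unix garbage collector pulled in by this merge. For orientation only, here is a small self-contained userspace sketch of textbook Tarjan SCC detection over a fixed graph; the kernel implementation differs (it is non-recursive, works on unix_vertex/unix_edge objects, and skips the algorithm entirely when no cycle is possible), so treat this purely as an illustration of what those fields track.

/*
 * Textbook recursive Tarjan SCC over a tiny fixed graph. Illustrative only:
 * the kernel's af_unix GC uses an iterative variant over unix_vertex objects.
 */
#include <stdio.h>

#define NV 5

static const int adj[NV][NV] = {	/* adjacency matrix: edge i -> j */
	{0, 1, 0, 0, 0},
	{0, 0, 1, 0, 0},
	{1, 0, 0, 1, 0},
	{0, 0, 0, 0, 1},
	{0, 0, 0, 0, 0},
};

static int idx[NV], low[NV], on_stack[NV], visited[NV];
static int stack[NV], top = -1, counter;

static void strongconnect(int v)
{
	int w;

	visited[v] = 1;
	idx[v] = low[v] = counter++;
	stack[++top] = v;
	on_stack[v] = 1;

	for (w = 0; w < NV; w++) {
		if (!adj[v][w])
			continue;
		if (!visited[w]) {
			strongconnect(w);
			if (low[w] < low[v])
				low[v] = low[w];
		} else if (on_stack[w] && idx[w] < low[v]) {
			low[v] = idx[w];
		}
	}

	if (low[v] == idx[v]) {		/* v is the root of an SCC */
		printf("SCC:");
		do {
			w = stack[top--];
			on_stack[w] = 0;
			printf(" %d", w);
		} while (w != v);
		printf("\n");
	}
}

int main(void)
{
	int v;

	for (v = 0; v < NV; v++)
		if (!visited[v])
			strongconnect(v);
	return 0;
}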


@@ -0,0 +1,232 @@
[abi_symbol_list]
# commonly used symbols
alloc_chrdev_region
alt_cb_patch_nops
__arch_copy_from_user
__arch_copy_to_user
cdev_add
cdev_del
cdev_init
__check_object_size
class_create
class_destroy
complete
dev_driver_string
_dev_err
device_create
device_destroy
_dev_info
devm_kfree
devm_kmalloc
devm_memremap
devm_request_threaded_irq
_dev_warn
fortify_panic
free_irq
__init_swait_queue_head
init_timer_key
__init_waitqueue_head
jiffies_to_usecs
kfree
__kmalloc
kmalloc_caches
kmalloc_trace
kstrtouint
log_post_read_mmio
log_read_mmio
memcpy
__memcpy_fromio
memset
module_layout
__mutex_init
mutex_lock
mutex_unlock
of_find_property
of_property_read_u32_index
of_property_read_variable_u32_array
panic
pid_task
__platform_driver_register
platform_driver_unregister
_printk
__put_task_struct
_raw_spin_lock
_raw_spin_unlock
request_threaded_irq
schedule_timeout
snprintf
__stack_chk_fail
strlen
strncmp
strnlen
strscpy
sysfs_create_group
sysfs_remove_group
system_cpucaps
system_wq
tegra_ivc_notified
tegra_ivc_read_advance
tegra_ivc_read_get_next_frame
tegra_ivc_reset
tegra_ivc_write_advance
tegra_ivc_write_get_next_frame
__traceiter_rwmmio_post_read
__traceiter_rwmmio_read
__tracepoint_rwmmio_post_read
__tracepoint_rwmmio_read
unregister_chrdev_region
__wake_up
__warn_printk
# required by ivc-cdev.ko
device_del
devm_free_irq
noop_llseek
remap_pfn_range
# required by ivc_ext.ko
dma_sync_single_for_cpu
__memcpy_toio
# required by nvsciipc.ko
_dev_notice
__fdget
find_get_pid
fput
platform_device_register_full
platform_device_unregister
sprintf
# required by tegra_bpmp.ko
clk_hw_determine_rate_no_reparent
clk_hw_get_name
clk_hw_unregister
debugfs_create_dir
debugfs_create_file
debugfs_remove
dentry_path_raw
devm_clk_hw_register
devm_reset_controller_register
dma_alloc_attrs
dma_free_attrs
_find_next_bit
kmalloc_large
kstrdup
ktime_get
of_clk_add_hw_provider
of_device_get_match_data
of_genpd_add_provider_onecell
__of_parse_phandle_with_args
of_platform_default_populate
pm_genpd_init
pm_genpd_remove
seq_lseek
seq_read
seq_write
single_open_size
single_release
strncpy
tegra_bpmp_free_mrq
tegra_bpmp_mrq_is_supported
tegra_bpmp_mrq_return
tegra_bpmp_request_mrq
tegra_bpmp_transfer
tegra_bpmp_transfer_atomic
tegra_sku_info
# required by tegra_hv.ko
arm64_use_ng_mappings
class_create_file_ns
ioremap_prot
iounmap
irq_get_irq_data
memstart_addr
of_add_property
of_chosen
of_find_compatible_node
of_irq_get
pfn_is_map_memory
tegra_ivc_init
# required by tegra_hv_pm_ctl.ko
__alloc_skb
find_vpid
finish_wait
init_net
init_wait_entry
msleep
__netlink_kernel_create
netlink_unicast
__nlmsg_put
prepare_to_wait_event
register_pm_notifier
schedule
strcmp
wait_for_completion_timeout
# required by tegra_hv_vblk_oops.ko
delayed_work_timer_fn
dma_map_page_attrs
__get_free_pages
is_vmalloc_addr
queue_delayed_work_on
# required by tegra_vblk.ko
blk_execute_rq
blk_mq_alloc_disk_for_queue
blk_mq_alloc_request
blk_mq_alloc_tag_set
blk_mq_destroy_queue
blk_mq_end_request
blk_mq_free_request
blk_mq_free_tag_set
blk_mq_init_queue
blk_mq_start_hw_queues
blk_mq_start_request
blk_mq_stop_hw_queues
blk_queue_flag_set
blk_queue_logical_block_size
blk_queue_max_discard_sectors
blk_queue_max_hw_sectors
blk_queue_max_secure_erase_sectors
blk_queue_physical_block_size
blk_queue_write_cache
__blk_rq_map_sg
capable
__cpu_possible_mask
del_gendisk
device_add_disk
device_create_file
disable_irq
disk_check_media_change
dma_map_sg_attrs
dma_unmap_sg_attrs
enable_irq
_find_first_zero_bit
jiffies
kasan_flag_enabled
kthread_create_on_cpu
kthread_create_on_node
__list_add_valid_or_report
__list_del_entry_valid_or_report
mod_timer
__num_online_cpus
of_find_node_by_name
put_disk
queue_work_on
_raw_spin_lock_irqsave
_raw_spin_unlock_irqrestore
__register_blkdev
sched_setattr_nocheck
set_capacity
set_disk_ro
sg_init_table
sg_nents
__sw_hweight64
timer_delete
unregister_blkdev
vfree
vzalloc
wait_for_completion
wait_for_completion_interruptible
wake_up_process

@@ -883,6 +883,7 @@
   drm_mode_duplicate
   drm_mode_equal
   drm_mode_equal_no_clocks
+  drm_mode_is_420_only
   drm_mode_object_find
   drm_mode_object_get
   drm_mode_object_put
@@ -2620,6 +2621,7 @@
   touch_softlockup_watchdog
   trace_array_destroy
   trace_array_get_by_name
+  trace_array_get_by_name_ext
   trace_array_put
   trace_array_set_clr_event
   trace_event_buffer_commit
@@ -2731,6 +2733,7 @@
   __traceiter_android_vh_ufs_update_sysfs
   __traceiter_android_vh_usb_dev_resume
   __traceiter_android_vh_use_amu_fie
+  __traceiter_android_vh_xhci_full_reset_on_remove
   __traceiter_clock_set_rate
   __traceiter_cma_alloc_finish
   __traceiter_cma_alloc_start
@@ -2869,6 +2872,7 @@
   __tracepoint_android_vh_ufs_update_sysfs
   __tracepoint_android_vh_usb_dev_resume
   __tracepoint_android_vh_use_amu_fie
+  __tracepoint_android_vh_xhci_full_reset_on_remove
   __tracepoint_clock_set_rate
   __tracepoint_cma_alloc_finish
   __tracepoint_cma_alloc_start

@@ -154,6 +154,8 @@
   __traceiter_android_vh_look_around_migrate_folio
   __traceiter_android_vh_lruvec_add_folio
   __traceiter_android_vh_lruvec_del_folio
+  __traceiter_android_vh_mempool_alloc_skip_wait
+  __traceiter_android_vh_mm_free_page
   __traceiter_android_vh_mmap_region
   __traceiter_android_vh_mutex_init
   __traceiter_android_vh_mutex_unlock_slowpath
@@ -284,6 +286,8 @@
   __tracepoint_android_vh_look_around_migrate_folio
   __tracepoint_android_vh_lruvec_add_folio
   __tracepoint_android_vh_lruvec_del_folio
+  __tracepoint_android_vh_mempool_alloc_skip_wait
+  __tracepoint_android_vh_mm_free_page
   __tracepoint_android_vh_mmap_region
   __tracepoint_android_vh_mutex_init
   __tracepoint_android_vh_mutex_unlock_slowpath

@@ -23,6 +23,8 @@
   __tracepoint_android_vh_tune_swappiness
   __traceiter_android_vh_do_shrink_slab_ex
   __tracepoint_android_vh_do_shrink_slab_ex
+  __traceiter_android_vh_migration_target_bypass
+  __tracepoint_android_vh_migration_target_bypass
 # required by lz4 decompress module
   __tracepoint_android_vh_lz4_decompress_bypass

@@ -1911,6 +1911,7 @@
   scsi_report_bus_reset
   scsi_scan_host
   scsi_unblock_requests
+  scsi_host_busy
   sdev_prefix_printk
   security_file_ioctl
   select_fallback_rq

@@ -737,6 +737,7 @@ CONFIG_CRYPTO_LZ4=y
 CONFIG_CRYPTO_ZSTD=y
 CONFIG_CRYPTO_ANSI_CPRNG=y
 CONFIG_CRYPTO_GHASH_ARM64_CE=y
+CONFIG_CRYPTO_SHA1_ARM64_CE=y
 CONFIG_CRYPTO_SHA2_ARM64_CE=y
 CONFIG_CRYPTO_SHA512_ARM64_CE=y
 CONFIG_CRYPTO_POLYVAL_ARM64_CE=y

@@ -8,6 +8,8 @@ CONFIG_RCU_EXPERT=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
+CONFIG_CGROUPS=y
+CONFIG_MEMCG=y
 # CONFIG_RD_GZIP is not set
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
@@ -136,8 +138,10 @@ CONFIG_STATIC_USERMODEHELPER_PATH=""
 CONFIG_SECURITY_SELINUX=y
 CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
 CONFIG_BUG_ON_DATA_CORRUPTION=y
+CONFIG_CRYPTO_SHA1=y
 CONFIG_CRYPTO_HCTR2=y
 CONFIG_CRYPTO_LZO=y
+CONFIG_CRYPTO_SHA1_ARM64_CE=y
 CONFIG_CRYPTO_SHA2_ARM64_CE=y
 CONFIG_CRYPTO_POLYVAL_ARM64_CE=y
 CONFIG_CRYPTO_AES_ARM64_CE_BLK=y

@@ -83,6 +83,7 @@ enum __kvm_host_smccc_func {
	__KVM_HOST_SMCCC_FUNC___pkvm_relax_perms,
	__KVM_HOST_SMCCC_FUNC___pkvm_wrprotect,
	__KVM_HOST_SMCCC_FUNC___pkvm_dirty_log,
+	__KVM_HOST_SMCCC_FUNC___pkvm_host_split_guest,
	__KVM_HOST_SMCCC_FUNC___pkvm_tlb_flush_vmid,
	__KVM_HOST_SMCCC_FUNC___kvm_adjust_pc,
	__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,

@@ -224,20 +224,36 @@ struct kvm_smccc_features {
 };

 struct kvm_pinned_page {
+	union {
+		struct rb_node node;
+		struct list_head list_node;
+	};
	struct page *page;
	u64 ipa;
+	u64 __subtree_last;
	u8 order;
	u16 pins;
 };

-#define KVM_DUMMY_PPAGE ((struct kvm_pinned_page *)-1)
+struct kvm_pinned_page
+*kvm_pinned_pages_iter_first(struct rb_root_cached *root, u64 start, u64 end);
+struct kvm_pinned_page
+*kvm_pinned_pages_iter_next(struct kvm_pinned_page *ppage, u64 start, u64 end);
+
+#define for_ppage_node_in_range(kvm, start, end, __ppage, __tmp) \
+	for (__ppage = kvm_pinned_pages_iter_first(&(kvm)->arch.pkvm.pinned_pages, start, end - 1);\
+	     __ppage && ({ __tmp = kvm_pinned_pages_iter_next(__ppage, start, end - 1); 1; }); \
+	     __ppage = __tmp)
+
+void kvm_pinned_pages_remove(struct kvm_pinned_page *ppage,
+			     struct rb_root_cached *root);

 typedef unsigned int pkvm_handle_t;

 struct kvm_protected_vm {
	pkvm_handle_t handle;
	struct kvm_hyp_memcache stage2_teardown_mc;
-	struct maple_tree pinned_pages;
+	_ANDROID_KABI_REPLACE(struct maple_tree __unused, struct rb_root_cached pinned_pages);
	gpa_t pvmfw_load_addr;
	bool enabled;
 };
@@ -525,6 +541,7 @@ struct kvm_hyp_req {
 #define KVM_HYP_LAST_REQ	0
 #define KVM_HYP_REQ_TYPE_MEM	1
 #define KVM_HYP_REQ_TYPE_MAP	2
+#define KVM_HYP_REQ_TYPE_SPLIT	3
	u8 type;
	union {
		struct {
@@ -539,6 +556,12 @@ struct kvm_hyp_req {
			unsigned long guest_ipa;
			size_t size;
		} map;
+#ifndef __GENKSYMS__
+		struct {
+			unsigned long guest_ipa;
+			size_t size;
+		} split;
+#endif
	};
 };
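The for_ppage_node_in_range() macro above saves the next interval-tree node in __tmp before the loop body runs, which is what lets callers such as pkvm_unmap_range() free the current kvm_pinned_page mid-walk. A minimal userspace sketch of the same prefetch-next idiom on a plain singly linked list follows; all names in it are illustrative, not kernel API.

/*
 * Prefetch-next iteration: grab the successor before the body may free the
 * current node. Same idea as for_ppage_node_in_range()/__tmp above, shown
 * on a plain singly linked list. Allocation error handling omitted.
 */
#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

#define for_each_node_safe(pos, tmp, head) \
	for (pos = (head); pos && (tmp = pos->next, 1); pos = tmp)

int main(void)
{
	struct node *head = NULL, *pos, *tmp;
	int i;

	/* Build a small list: 4 -> 3 -> 2 -> 1 -> 0 */
	for (i = 0; i < 5; i++) {
		struct node *n = malloc(sizeof(*n));

		n->val = i;
		n->next = head;
		head = n;
	}

	/* Safe to free the current node: its successor was already saved. */
	for_each_node_safe(pos, tmp, head) {
		printf("freeing %d\n", pos->val);
		free(pos);
	}
	return 0;
}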

@@ -184,6 +184,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
 int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t size);
+int __pkvm_pgtable_stage2_split(struct kvm_vcpu *vcpu, phys_addr_t ipa, size_t size);

 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);

@@ -862,8 +862,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
  * kvm_pgtable_stage2_split() is best effort: it tries to break as many
  * blocks in the input range as allowed by @mc_capacity.
  */
-int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
-			     struct kvm_mmu_memory_cache *mc);
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc);

 /**
  * kvm_pgtable_walk() - Walk a page-table.

@@ -363,6 +363,11 @@ static int handle_hyp_req_map(struct kvm_vcpu *vcpu,
	return pkvm_mem_abort_range(vcpu, req->map.guest_ipa, req->map.size);
 }

+static int handle_hyp_req_split(struct kvm_vcpu *vcpu, struct kvm_hyp_req *req)
+{
+	return __pkvm_pgtable_stage2_split(vcpu, req->split.guest_ipa, req->split.size);
+}
+
 static int handle_hyp_req(struct kvm_vcpu *vcpu)
 {
	struct kvm_hyp_req *hyp_req = vcpu->arch.hyp_reqs;
@@ -379,6 +384,9 @@ static int handle_hyp_req(struct kvm_vcpu *vcpu)
	case KVM_HYP_REQ_TYPE_MAP:
		ret = handle_hyp_req_map(vcpu, hyp_req);
		break;
+	case KVM_HYP_REQ_TYPE_SPLIT:
+		ret = handle_hyp_req_split(vcpu, hyp_req);
+		break;
	default:
		pr_warn("Unknown kvm_hyp_req type: %d\n", hyp_req->type);
		ret = -EINVAL;

@@ -63,6 +63,7 @@ int __pkvm_host_unuse_dma(u64 phys_addr, size_t size);
 int __pkvm_guest_stage2_snapshot(struct kvm_pgtable_snapshot *snap, struct pkvm_hyp_vm *vm);
 int __pkvm_host_stage2_snapshot(struct kvm_pgtable_snapshot *snap);
 int __pkvm_host_lazy_pte(u64 pfn, u64 nr_pages, bool enable);
+int __pkvm_host_split_guest(u64 pfn, u64 gfn, u64 size, struct pkvm_hyp_vcpu *vcpu);

 bool addr_is_memory(phys_addr_t phys);
 int host_stage2_idmap_locked(phys_addr_t addr, u64 size,

@@ -556,7 +556,7 @@ void *hyp_alloc(size_t size)
	unsigned long chunk_addr;
	int missing_map, ret = 0;

-	size = ALIGN(size, MIN_ALLOC);
+	size = ALIGN(size ?: MIN_ALLOC, MIN_ALLOC);

	hyp_spin_lock(&allocator->lock);
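The one-liner above matters because ALIGN() rounds up to a multiple of the alignment and leaves 0 at 0, so a zero-byte hyp_alloc() request previously bypassed the minimum-chunk rounding; "size ?: MIN_ALLOC" makes it behave like a MIN_ALLOC-byte request. A small userspace illustration, with ALIGN() and MIN_ALLOC redefined locally for the demo (the real MIN_ALLOC value lives in the hypervisor allocator):

/* Demonstrates why hyp_alloc(0) needed the "size ?: MIN_ALLOC" clamp. */
#include <stdio.h>

#define MIN_ALLOC 8UL
/* Same rounding the kernel's ALIGN() performs for power-of-two alignments. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long size = 0;

	/* "?:" is a GCC/Clang extension, the same one the patched line uses. */
	printf("old: ALIGN(%lu, %lu) = %lu\n", size, MIN_ALLOC,
	       ALIGN(size, MIN_ALLOC));			/* 0: nothing reserved */
	printf("new: ALIGN(size ?: MIN_ALLOC, %lu) = %lu\n", MIN_ALLOC,
	       ALIGN(size ?: MIN_ALLOC, MIN_ALLOC));	/* 8: at least one chunk */
	return 0;
}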

@@ -1073,6 +1073,27 @@ out:
	cpu_reg(host_ctxt, 1) = ret;
 }

+static void handle___pkvm_host_split_guest(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(u64, pfn, host_ctxt, 1);
+	DECLARE_REG(u64, gfn, host_ctxt, 2);
+	DECLARE_REG(u64, size, host_ctxt, 3);
+	struct pkvm_hyp_vcpu *hyp_vcpu;
+	int ret = -EINVAL;
+
+	if (!is_protected_kvm_enabled())
+		goto out;
+
+	hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
+	if (!hyp_vcpu)
+		goto out;
+
+	ret = __pkvm_host_split_guest(pfn, gfn, size, hyp_vcpu);
+out:
+	cpu_reg(host_ctxt, 1) = ret;
+}
+
 static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt)
 {
	struct pkvm_hyp_vcpu *hyp_vcpu;
@@ -1618,6 +1639,7 @@ static const hcall_t host_hcall[] = {
	HANDLE_FUNC(__pkvm_relax_perms),
	HANDLE_FUNC(__pkvm_wrprotect),
	HANDLE_FUNC(__pkvm_dirty_log),
+	HANDLE_FUNC(__pkvm_host_split_guest),
	HANDLE_FUNC(__pkvm_tlb_flush_vmid),
	HANDLE_FUNC(__kvm_adjust_pc),
	HANDLE_FUNC(__kvm_vcpu_run),

@@ -387,6 +387,10 @@ static int relinquish_walker(const struct kvm_pgtable_visit_ctx *ctx,
	if (!kvm_pte_valid(pte))
		return 0;

+	/* We don't support splitting non-leaf mappings */
+	if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1))
+		return -E2BIG;
+
	state = pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
	if (state != data->expected_state)
		return -EPERM;
@@ -433,8 +437,7 @@ int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu,
		goto end;

	/* Zap the guest stage2 pte and return ownership to the host */
-	ret = kvm_pgtable_stage2_annotate(&vm->pgt, ipa, PAGE_SIZE,
-					  &vcpu->vcpu.arch.stage2_mc, 0);
+	ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE);
	if (ret)
		goto end;
@@ -2760,6 +2763,30 @@ unlock:
 }

+int __pkvm_host_split_guest(u64 pfn, u64 gfn, u64 size, struct pkvm_hyp_vcpu *vcpu)
+{
+	struct kvm_hyp_memcache *mc = &vcpu->vcpu.arch.stage2_mc;
+	struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
+	u64 ipa = hyp_pfn_to_phys(gfn);
+	int ret;
+
+	if (size != PMD_SIZE)
+		return -EINVAL;
+
+	guest_lock_component(vm);
+
+	/*
+	 * stage2_split() already checks the existing mapping is valid and PMD-level.
+	 * No other check is necessary.
+	 */
+	ret = kvm_pgtable_stage2_split(&vm->pgt, ipa, size, mc);
+
+	guest_unlock_component(vm);
+
+	return ret;
+}
+
 int __pkvm_host_donate_guest(struct pkvm_hyp_vcpu *vcpu, u64 pfn, u64 gfn,
			     u64 nr_pages)
 {

@@ -702,16 +702,13 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
	if (ret)
		goto done;

-	ret = pkvm_vcpu_init_psci(hyp_vcpu);
-	if (ret)
-		goto done;
-
	if (test_bit(KVM_ARM_VCPU_SVE, hyp_vcpu->vcpu.arch.features)) {
		ret = init_pkvm_hyp_vcpu_sve(hyp_vcpu, host_vcpu);
		if (ret)
			goto done;
	}

+	WARN_ON(pkvm_vcpu_init_psci(hyp_vcpu));
	pkvm_vcpu_init_traps(hyp_vcpu);
	kvm_reset_pvm_sys_regs(&hyp_vcpu->vcpu);
 done:
@@ -1588,9 +1585,19 @@ static bool pkvm_memrelinquish_call(struct pkvm_hyp_vcpu *hyp_vcpu,
		goto out_guest_err;

	ret = __pkvm_guest_relinquish_to_host(hyp_vcpu, ipa, &pa);
-	if (ret == -ENOMEM) {
-		if (pkvm_handle_empty_memcache(hyp_vcpu, exit_code))
+	if (ret == -E2BIG) {
+		struct kvm_hyp_req *req = pkvm_hyp_req_reserve(hyp_vcpu, KVM_HYP_REQ_TYPE_SPLIT);
+
+		if (!req) {
+			ret = -ENOMEM;
			goto out_guest_err;
+		}
+
+		req->split.guest_ipa = ALIGN_DOWN(ipa, PMD_SIZE);
+		req->split.size = PMD_SIZE;
+
+		write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR);
+		*exit_code = ARM_EXCEPTION_HYP_REQ;
		return false;
	} else if (ret) {

@@ -1769,13 +1769,49 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
	return 0;
 }

-int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
-			     struct kvm_mmu_memory_cache *mc)
+static int pkvm_stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
+				    enum kvm_pgtable_walk_flags visit)
 {
+	struct stage2_map_data *data = ctx->arg;
+	struct kvm_pgtable *pgt = data->mmu->pgt;
+	struct kvm_hyp_memcache *mc = data->memcache;
+	enum kvm_pgtable_prot prot;
+	kvm_pte_t pte = ctx->old;
+	kvm_pte_t *childp;
+
+	if (ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)
+		return 0;
+
+	/* We can only split PMD-level blocks */
+	if (!kvm_pte_valid(pte) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 2)
+		return -EINVAL;
+
+	prot = kvm_pgtable_stage2_pte_prot(pte);
+	childp = kvm_pgtable_stage2_create_unlinked(pgt, kvm_pte_to_phys(pte),
+						    ctx->level, prot, mc, true);
+	if (IS_ERR(childp))
+		return PTR_ERR(childp);
+
+	WARN_ON(!stage2_try_break_pte(ctx, data->mmu));
+	stage2_make_pte(ctx, kvm_init_table_pte(childp, ctx->mm_ops));
+	dsb(ishst);
+
+	return 0;
+}
+
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc)
+{
+	struct stage2_map_data data = {
+		.mmu = pgt->mmu,
+		.memcache = mc,
+	};
	struct kvm_pgtable_walker walker = {
-		.cb = stage2_split_walker,
+		.cb = static_branch_unlikely(&kvm_protected_mode_initialized) ?
+		      pkvm_stage2_split_walker : stage2_split_walker,
+		.arg = static_branch_unlikely(&kvm_protected_mode_initialized) ?
+		       &data : mc,
		.flags = KVM_PGTABLE_WALK_LEAF,
-		.arg = mc,
	};

	return kvm_pgtable_walk(pgt, addr, size, &walker);

@@ -6,11 +6,11 @@
 #include <linux/cma.h>
 #include <linux/dma-map-ops.h>
-#include <linux/maple_tree.h>
 #include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/io.h>
 #include <linux/hugetlb.h>
+#include <linux/interval_tree_generic.h>
 #include <linux/sched/signal.h>
 #include <trace/events/kvm.h>
 #include <asm/pgalloc.h>
@@ -291,6 +291,20 @@ static void invalidate_icache_guest_page(void *va, size_t size)
	__invalidate_icache_guest_page(va, size);
 }

+static u64 __pinned_page_start(struct kvm_pinned_page *ppage)
+{
+	return ppage->ipa;
+}
+
+static u64 __pinned_page_end(struct kvm_pinned_page *ppage)
+{
+	return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
+		     __pinned_page_start, __pinned_page_end, /* empty */,
+		     kvm_pinned_pages);
+
 static int __pkvm_unmap_guest_call(u64 pfn, u64 gfn, u8 order, void *args)
 {
	struct kvm *kvm = args;
@@ -312,7 +326,7 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
	 * no update needed from here.
	 */
	unpin_user_pages(&ppage->page, 1);
-	mtree_erase(&kvm->arch.pkvm.pinned_pages, ppage->ipa);
+	kvm_pinned_pages_remove(ppage, &kvm->arch.pkvm.pinned_pages);
	kfree(ppage);

	return 0;
@@ -320,17 +334,12 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
 static int pkvm_unmap_range(struct kvm *kvm, u64 start, u64 end)
 {
+	struct kvm_pinned_page *ppage, *tmp;
	struct mm_struct *mm = kvm->mm;
-	unsigned long index = start;
	unsigned long cnt = 0;
-	void *entry;
	int ret = 0;

-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
-		struct kvm_pinned_page *ppage = entry;
-
-		if (ppage == KVM_DUMMY_PPAGE)
-			continue;
+	for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
		ret = pkvm_unmap_guest(kvm, ppage);
		if (ret)
			break;
@@ -418,8 +427,7 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
 static void pkvm_stage2_flush(struct kvm *kvm)
 {
-	unsigned long index = 0;
-	void *entry;
+	struct kvm_pinned_page *ppage, *tmp;

	/*
	 * Contrary to stage2_apply_range(), we don't need to check
@@ -427,11 +435,7 @@ static void pkvm_stage2_flush(struct kvm *kvm)
	 * from a vcpu thread, and the list is only ever freed on VM
	 * destroy (which only occurs when all vcpu are gone).
	 */
-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
-		struct kvm_pinned_page *ppage = entry;
-
-		if (ppage == KVM_DUMMY_PPAGE)
-			continue;
+	for_ppage_node_in_range(kvm, 0, ULONG_MAX, ppage, tmp) {
		__clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE);
		cond_resched_rwlock_write(&kvm->mmu_lock);
	}
@@ -1014,7 +1018,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
	kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
-	mt_init_flags(&kvm->arch.pkvm.pinned_pages, MT_FLAGS_USE_RCU);
	mmu->arch = &kvm->arch;

	if (is_protected_kvm_enabled())
@@ -1293,18 +1296,13 @@ static int __pkvm_wrprotect_call(u64 pfn, u64 gfn, u8 order, void *args)
 static int pkvm_wp_range(struct kvm *kvm, u64 start, u64 end)
 {
-	unsigned long index = start;
-	void *entry;
+	struct kvm_pinned_page *ppage, *tmp;

-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
-		struct kvm_pinned_page *ppage = entry;
+	for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
		int ret;

-		if (ppage == KVM_DUMMY_PPAGE)
-			continue;
		ret = pkvm_call_hyp_nvhe_ppage(ppage, __pkvm_wrprotect_call,
					       kvm, false);
		if (ret)
			return ret;
	}
@@ -1630,28 +1628,9 @@ static int pkvm_host_map_guest(u64 pfn, u64 gfn, u64 nr_pages,
	return (ret == -EPERM) ? -EAGAIN : ret;
 }

-static struct kvm_pinned_page *
-find_ppage_or_above(struct kvm *kvm, phys_addr_t ipa)
-{
-	unsigned long index = ipa;
-	void *entry;
-
-	mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
-		if (entry == KVM_DUMMY_PPAGE)
-			continue;
-		return entry;
-	}
-
-	return NULL;
-}
-
 static struct kvm_pinned_page *find_ppage(struct kvm *kvm, u64 ipa)
 {
-	struct kvm_pinned_page *ppage;
-	unsigned long index = ipa;
-
-	ppage = mt_find(&kvm->arch.pkvm.pinned_pages, &index, ipa + PAGE_SIZE - 1);
-
-	return ppage == KVM_DUMMY_PPAGE ? NULL : ppage;
+	return kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages, ipa, ipa + PAGE_SIZE - 1);
 }

 static int __pkvm_relax_perms_call(u64 pfn, u64 gfn, u8 order, void *args)
@@ -1707,11 +1686,10 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
 {
	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
-	unsigned long index, pmd_offset, page_size, end;
+	unsigned long page_size = PAGE_SIZE;
	struct mm_struct *mm = current->mm;
	struct kvm_pinned_page *ppage;
	struct kvm *kvm = vcpu->kvm;
-	struct maple_tree *mt = &kvm->arch.pkvm.pinned_pages;
	int ret, nr_pages;
	struct page *page;
	u64 pfn;
@@ -1760,66 +1738,49 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
	}

	pfn = page_to_pfn(page);
-	pmd_offset = *fault_ipa & (PMD_SIZE - 1);
-	page_size = transparent_hugepage_adjust(kvm, memslot,
-						hva, &pfn,
-						fault_ipa);
-	page = pfn_to_page(pfn);
-retry:
-	if (size)
-		*size = page_size;
+	read_lock(&kvm->mmu_lock);
+	if (!kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
+					 ALIGN_DOWN(*fault_ipa, PMD_SIZE),
+					 ALIGN(*fault_ipa + 1, PMD_SIZE) - 1))
+		page_size = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, fault_ipa);
+	/*
+	 * We take the risk of racing with another vCPU, but sync will be restored by the
+	 * host_map_guest HVC
+	 */
+	read_unlock(&kvm->mmu_lock);
+	page = pfn_to_page(pfn);

	ret = account_locked_vm(mm, page_size >> PAGE_SHIFT, true);
	if (ret)
		goto unpin;

-	index = *fault_ipa;
-	end = index + page_size - 1;
	ppage->page = page;
	ppage->ipa = *fault_ipa;
	ppage->order = get_order(page_size);
	ppage->pins = 1 << ppage->order;

-	/*
-	 * If we already have a mapping in the middle of the THP, we have no
-	 * other choice than enforcing PAGE_SIZE for pkvm_host_map_guest() to
-	 * succeed.
-	 */
-	if (page_size > PAGE_SIZE && mt_find(mt, &index, end)) {
-		*fault_ipa += pmd_offset;
-		pfn += pmd_offset >> PAGE_SHIFT;
-		page = pfn_to_page(pfn);
-		account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
-		page_size = PAGE_SIZE;
-		goto retry;
-	}
-
-	/* Reserve space in the mtree */
-	ret = mtree_insert_range(mt, index, end, KVM_DUMMY_PPAGE, GFP_KERNEL);
-	if (ret) {
-		if (ret == -EEXIST)
-			ret = 0;
-		goto dec_account;
-	}
-
	write_lock(&kvm->mmu_lock);
	ret = pkvm_host_map_guest(pfn, *fault_ipa >> PAGE_SHIFT,
				  page_size >> PAGE_SHIFT, KVM_PGTABLE_PROT_R);
	if (ret) {
-		if (WARN_ON(ret == -EAGAIN))
+		if (ret == -EAGAIN)
			ret = 0;
		goto err_unlock;
	}

-	WARN_ON(mtree_store_range(mt, index, end, ppage, GFP_ATOMIC));
+	kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
	write_unlock(&kvm->mmu_lock);

+	if (size)
+		*size = page_size;
+
	return 0;

 err_unlock:
	write_unlock(&kvm->mmu_lock);
-dec_account:
	account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
 unpin:
	unpin_user_pages(&page, 1);
@@ -1847,13 +1808,13 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
	idx = srcu_read_lock(&vcpu->kvm->srcu);
	read_lock(&vcpu->kvm->mmu_lock);

-	ppage = find_ppage_or_above(vcpu->kvm, fault_ipa);
+	ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
+					    fault_ipa, ipa_end);

	while (fault_ipa < ipa_end) {
-		if (ppage && ppage != KVM_DUMMY_PPAGE && ppage->ipa == fault_ipa) {
+		if (ppage && ppage->ipa == fault_ipa) {
			page_size = PAGE_SIZE << ppage->order;
-			ppage = mt_next(&vcpu->kvm->arch.pkvm.pinned_pages,
-					ppage->ipa, ULONG_MAX);
+			ppage = kvm_pinned_pages_iter_next(ppage, fault_ipa, ipa_end);
		} else {
			gfn_t gfn = gpa_to_gfn(fault_ipa);
			struct kvm_memory_slot *memslot;
@@ -1877,7 +1838,8 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
			 * We had to release the mmu_lock so let's update the
			 * reference.
			 */
-			ppage = find_ppage_or_above(vcpu->kvm, fault_ipa + page_size);
+			ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
+							    fault_ipa + PAGE_SIZE, ipa_end);
		}

		fault_ipa += page_size;
@@ -1889,6 +1851,162 @@ end:
	return err;
 }

+static int __pkvm_pin_user_pages(struct kvm *kvm, struct kvm_memory_slot *memslot,
+				 u64 gfn, u64 nr_pages, struct page ***__pages)
+{
+	unsigned long hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
+	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
+	struct mm_struct *mm = current->mm;
+	struct page **pages;
+	long ret;
+	int p;
+
+	pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	mmap_read_lock(mm);
+	ret = pin_user_pages(hva, nr_pages, flags, pages);
+	mmap_read_unlock(mm);
+
+	if (ret == -EHWPOISON) {
+		kvm_send_hwpoison_signal(hva, PAGE_SHIFT);
+		goto err_free_pages;
+	} else if (ret == -EFAULT) {
+		/* Will try MMIO map */
+		ret = -EREMOTEIO;
+		goto err_free_pages;
+	} else if (ret < 0) {
+		ret = -EFAULT;
+		goto err_free_pages;
+	} else if (ret != nr_pages) {
+		nr_pages = ret;
+		ret = -EFAULT;
+		goto err_unpin_pages;
+	}
+
+	/* See PageSwapBacked() in pkvm_mem_abort() */
+	for (p = 0; p < nr_pages; p++) {
+		if (!folio_test_swapbacked(page_folio(pages[p]))) {
+			ret = -EIO;
+			goto err_unpin_pages;
+		}
+	}
+
+	*__pages = pages;
+
+	return 0;
+
+err_unpin_pages:
+	unpin_user_pages(pages, nr_pages);
+err_free_pages:
+	kfree(pages);
+
+	return ret;
+}
+
+/*
+ * Splitting is only expected on the back of a relinquish guest HVC in the pKVM case, while
+ * pkvm_pgtable_stage2_split() can be called with dirty logging.
+ */
+int __pkvm_pgtable_stage2_split(struct kvm_vcpu *vcpu, phys_addr_t ipa, size_t size)
+{
+	struct list_head ppage_prealloc = LIST_HEAD_INIT(ppage_prealloc);
+	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
+	struct kvm_pinned_page *ppage, *tmp;
+	struct kvm_memory_slot *memslot;
+	struct kvm *kvm = vcpu->kvm;
+	int idx, p, ret, nr_pages;
+	struct page **pages;
+	kvm_pfn_t pfn;
+	gfn_t gfn;
+
+	if (!IS_ALIGNED(ipa, PMD_SIZE) || size != PMD_SIZE)
+		return -EINVAL;
+
+	if (!hyp_memcache->nr_pages) {
+		ret = topup_hyp_memcache(hyp_memcache, 1, 0);
+		if (ret)
+			return -ENOMEM;
+		atomic64_add(PAGE_SIZE, &kvm->stat.protected_hyp_mem);
+		atomic64_add(PAGE_SIZE, &kvm->stat.protected_pgtable_mem);
+	}
+
+	/* We already have 1 pin on the Huge Page */
+	nr_pages = (size >> PAGE_SHIFT) - 1;
+	gfn = (ipa >> PAGE_SHIFT) + 1;
+
+	/* Pre-allocate kvm_pinned_page before acquiring the mmu_lock */
+	for (p = 0; p < nr_pages; p++) {
+		ppage = kzalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
+		if (!ppage) {
+			ret = -ENOMEM;
+			goto free_pinned_pages;
+		}
+		list_add(&ppage->list_node, &ppage_prealloc);
+	}
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+	memslot = gfn_to_memslot(vcpu->kvm, gfn);
+	ret = __pkvm_pin_user_pages(kvm, memslot, gfn, nr_pages, &pages);
+	if (ret)
+		goto unlock_srcu;
+
+	write_lock(&kvm->mmu_lock);
+
+	ppage = find_ppage(kvm, ipa);
+	if (!ppage) {
+		ret = -EPERM;
+		goto end;
+	} else if (!ppage->order) {
+		ret = 0;
+		goto end;
+	}
+
+	ret = kvm_call_hyp_nvhe(__pkvm_host_split_guest, page_to_pfn(ppage->page),
+				ipa >> PAGE_SHIFT, size);
+	if (ret)
+		goto end;
+
+	ppage->order = 0;
+	ppage->pins = 1;
+	pfn = page_to_pfn(ppage->page) + 1;
+	ipa = ipa + PAGE_SIZE;
+
+	while (nr_pages--) {
+		/* Pop a ppage from the pre-allocated list */
+		ppage = list_first_entry(&ppage_prealloc, struct kvm_pinned_page, list_node);
+		list_del_init(&ppage->list_node);
+
+		ppage->page = pfn_to_page(pfn);
+		ppage->ipa = ipa;
+		ppage->order = 0;
+		ppage->pins = 1;
+		kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
+
+		pfn += 1;
+		ipa += PAGE_SIZE;
+	}
+
+end:
+	write_unlock(&kvm->mmu_lock);
+	if (ret)
+		unpin_user_pages(pages, nr_pages);
+	kfree(pages);
+unlock_srcu:
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+free_pinned_pages:
+	/* Free unused pre-allocated kvm_pinned_page */
+	list_for_each_entry_safe(ppage, tmp, &ppage_prealloc, list_node) {
+		list_del(&ppage->list_node);
+		kfree(ppage);
+	}
+
+	return ret;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  unsigned long fault_status)

@@ -319,21 +319,17 @@ static int __reclaim_dying_guest_page_call(u64 pfn, u64 gfn, u8 order, void *arg
 static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
 {
+	struct kvm_pinned_page *tmp, *ppage;
	struct mm_struct *mm = current->mm;
-	struct kvm_pinned_page *ppage;
	struct kvm_vcpu *host_vcpu;
-	unsigned long idx, ipa = 0;
+	unsigned long idx;

	if (!host_kvm->arch.pkvm.handle)
		goto out_free;

	WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, host_kvm->arch.pkvm.handle));

-	mt_clear_in_rcu(&host_kvm->arch.pkvm.pinned_pages);
-	mt_for_each(&host_kvm->arch.pkvm.pinned_pages, ppage, ipa, ULONG_MAX) {
-		if (WARN_ON(ppage == KVM_DUMMY_PPAGE))
-			continue;
+	for_ppage_node_in_range(host_kvm, 0, ULONG_MAX, ppage, tmp) {
		WARN_ON(pkvm_call_hyp_nvhe_ppage(ppage,
						 __reclaim_dying_guest_page_call,
						 host_kvm, true));
@@ -341,9 +337,9 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
		account_locked_vm(mm, 1, false);
		unpin_user_pages_dirty_lock(&ppage->page, 1, host_kvm->arch.pkvm.enabled);
+		kvm_pinned_pages_remove(ppage, &host_kvm->arch.pkvm.pinned_pages);
		kfree(ppage);
	}
-	mtree_destroy(&host_kvm->arch.pkvm.pinned_pages);

	WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm, host_kvm->arch.pkvm.handle));
@@ -538,21 +534,21 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
 {
	struct mm_struct *mm = current->mm;
	struct kvm_pinned_page *ppage;
-	unsigned long index = ipa;
	u16 pins;

	write_lock(&host_kvm->mmu_lock);

-	ppage = mt_find(&host_kvm->arch.pkvm.pinned_pages, &index,
-			index + PAGE_SIZE - 1);
-	if (ppage && ppage != KVM_DUMMY_PPAGE) {
+	ppage = kvm_pinned_pages_iter_first(&host_kvm->arch.pkvm.pinned_pages,
+					    ipa, ipa + PAGE_SIZE - 1);
+	if (ppage) {
+		WARN_ON_ONCE(ppage->pins != 1);
		if (ppage->pins)
			ppage->pins--;
-		else
-			WARN_ON(1);

		pins = ppage->pins;
		if (!pins)
-			mtree_erase(&host_kvm->arch.pkvm.pinned_pages, ipa);
+			kvm_pinned_pages_remove(ppage,
						&host_kvm->arch.pkvm.pinned_pages);
	}

	write_unlock(&host_kvm->mmu_lock);

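Note on the pinned-pages rework above: the kvm_pinned_pages_insert/remove/iter_first helpers that replace the old maple-tree lookups behave like the accessors generated by INTERVAL_TREE_DEFINE(). The sketch below is illustrative only; the field names (node, __subtree_last) and the exact struct layout are assumptions, not the tree's actual definition.

    #include <linux/interval_tree_generic.h>

    /* Hypothetical layout; field names are assumptions for illustration. */
    struct kvm_pinned_page {
    	struct rb_node	node;
    	u64		__subtree_last;
    	struct page	*page;
    	u64		ipa;
    	u8		order;
    	u16		pins;
    };

    #define PPAGE_START(p)	((p)->ipa)
    #define PPAGE_END(p)	((p)->ipa + (PAGE_SIZE << (p)->order) - 1)

    /* Generates kvm_pinned_pages_insert/remove/iter_first/iter_next. */
    INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
    		     PPAGE_START, PPAGE_END, static, kvm_pinned_pages);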

@@ -672,6 +672,7 @@ CONFIG_CRYPTO_ZSTD=y
CONFIG_CRYPTO_ANSI_CPRNG=y
CONFIG_CRYPTO_AES_NI_INTEL=y
CONFIG_CRYPTO_POLYVAL_CLMUL_NI=y
+CONFIG_CRYPTO_SHA1_SSSE3=y
CONFIG_CRYPTO_SHA256_SSSE3=y
CONFIG_CRYPTO_SHA512_SSSE3=y
CONFIG_CRC_CCITT=y


@@ -14,12 +14,6 @@ CONFIG_UCLAMP_TASK=y
CONFIG_UCLAMP_BUCKETS_COUNT=20
CONFIG_CGROUPS=y
CONFIG_MEMCG=y
-CONFIG_BLK_CGROUP=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_UCLAMP_TASK_GROUP=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CPUSETS=y
-CONFIG_CGROUP_CPUACCT=y
# CONFIG_RD_BZIP2 is not set
# CONFIG_RD_LZMA is not set
# CONFIG_RD_XZ is not set
@@ -47,7 +41,6 @@ CONFIG_CPU_FREQ_GOV_POWERSAVE=y
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
CONFIG_JUMP_LABEL=y
# CONFIG_BLOCK_LEGACY_AUTOLOAD is not set
-CONFIG_BLK_CGROUP_IOCOST=y
CONFIG_PARTITION_ADVANCED=y
# CONFIG_MSDOS_PARTITION is not set
# CONFIG_MQ_IOSCHED_DEADLINE is not set
@@ -209,6 +202,7 @@ CONFIG_CRYPTO_HCTR2=y
CONFIG_CRYPTO_LZO=y
CONFIG_CRYPTO_AES_NI_INTEL=y
CONFIG_CRYPTO_POLYVAL_CLMUL_NI=y
+CONFIG_CRYPTO_SHA1_SSSE3=y
CONFIG_CRYPTO_SHA256_SSSE3=y
CONFIG_CRYPTO_SHA512_SSSE3=y
CONFIG_PRINTK_TIME=y


@@ -6645,7 +6645,7 @@ static void print_binder_work_ilocked(struct seq_file *m,
struct binder_proc *proc,
const char *prefix,
const char *transaction_prefix,
-struct binder_work *w)
+struct binder_work *w, bool hash_ptrs)
{
struct binder_node *node;
struct binder_transaction *t;
@@ -6668,6 +6668,12 @@ static void print_binder_work_ilocked(struct seq_file *m,
break;
case BINDER_WORK_NODE:
node = container_of(w, struct binder_node, work);
+if (hash_ptrs)
+seq_printf(m, "%snode work %d: u%p c%p\n",
+prefix, node->debug_id,
+(void *)(long)node->ptr,
+(void *)(long)node->cookie);
+else
seq_printf(m, "%snode work %d: u%016llx c%016llx\n",
prefix, node->debug_id,
(u64)node->ptr, (u64)node->cookie);
@@ -6695,7 +6701,7 @@ static void print_binder_work_ilocked(struct seq_file *m,
static void print_binder_thread_ilocked(struct seq_file *m,
struct binder_thread *thread,
-int print_always)
+bool print_always, bool hash_ptrs)
{
struct binder_transaction *t;
struct binder_work *w;
@@ -6725,14 +6731,16 @@ static void print_binder_thread_ilocked(struct seq_file *m,
}
list_for_each_entry(w, &thread->todo, entry) {
print_binder_work_ilocked(m, thread->proc, " ",
-" pending transaction", w);
+" pending transaction",
+w, hash_ptrs);
}
if (!print_always && m->count == header_pos)
m->count = start_pos;
}
static void print_binder_node_nilocked(struct seq_file *m,
-struct binder_node *node)
+struct binder_node *node,
+bool hash_ptrs)
{
struct binder_ref *ref;
struct binder_work *w;
@@ -6742,8 +6750,13 @@ static void print_binder_node_nilocked(struct seq_file *m,
hlist_for_each_entry(ref, &node->refs, node_entry)
count++;
-seq_printf(m, " node %d: u%016llx c%016llx pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d",
-node->debug_id, (u64)node->ptr, (u64)node->cookie,
+if (hash_ptrs)
+seq_printf(m, " node %d: u%p c%p", node->debug_id,
+(void *)(long)node->ptr, (void *)(long)node->cookie);
+else
+seq_printf(m, " node %d: u%016llx c%016llx", node->debug_id,
+(u64)node->ptr, (u64)node->cookie);
+seq_printf(m, " pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d",
node->sched_policy, node->min_priority,
node->has_strong_ref, node->has_weak_ref,
node->local_strong_refs, node->local_weak_refs,
@@ -6757,7 +6770,8 @@ static void print_binder_node_nilocked(struct seq_file *m,
if (node->proc) {
list_for_each_entry(w, &node->async_todo, entry)
print_binder_work_ilocked(m, node->proc, " ",
-" pending async transaction", w);
+" pending async transaction",
+w, hash_ptrs);
}
}
@@ -6773,8 +6787,54 @@ static void print_binder_ref_olocked(struct seq_file *m,
binder_node_unlock(ref->node);
}
-static void print_binder_proc(struct seq_file *m,
-struct binder_proc *proc, int print_all)
+/**
+* print_next_binder_node_ilocked() - Print binder_node from a locked list
+* @m: struct seq_file for output via seq_printf()
+* @proc: struct binder_proc we hold the inner_proc_lock to (if any)
+* @node: struct binder_node to print fields of
+* @prev_node: struct binder_node we hold a temporary reference to (if any)
+* @hash_ptrs: whether to hash @node's binder_uintptr_t fields
+*
+* Helper function to handle synchronization around printing a struct
+* binder_node while iterating through @proc->nodes or the dead nodes list.
+* Caller must hold either @proc->inner_lock (for live nodes) or
+* binder_dead_nodes_lock. This lock will be released during the body of this
+* function, but it will be reacquired before returning to the caller.
+*
+* Return: pointer to the struct binder_node we hold a tmpref on
+*/
+static struct binder_node *
+print_next_binder_node_ilocked(struct seq_file *m, struct binder_proc *proc,
+struct binder_node *node,
+struct binder_node *prev_node, bool hash_ptrs)
+{
+/*
+* Take a temporary reference on the node so that isn't freed while
+* we print it.
+*/
+binder_inc_node_tmpref_ilocked(node);
+/*
+* Live nodes need to drop the inner proc lock and dead nodes need to
+* drop the binder_dead_nodes_lock before trying to take the node lock.
+*/
+if (proc)
+binder_inner_proc_unlock(proc);
+else
+spin_unlock(&binder_dead_nodes_lock);
+if (prev_node)
+binder_put_node(prev_node);
+binder_node_inner_lock(node);
+print_binder_node_nilocked(m, node, hash_ptrs);
+binder_node_inner_unlock(node);
+if (proc)
+binder_inner_proc_lock(proc);
+else
+spin_lock(&binder_dead_nodes_lock);
+return node;
+}
+
+static void print_binder_proc(struct seq_file *m, struct binder_proc *proc,
+bool print_all, bool hash_ptrs)
{
struct binder_work *w;
struct rb_node *n;
@@ -6787,31 +6847,19 @@ static void print_binder_proc(struct seq_file *m,
header_pos = m->count;
binder_inner_proc_lock(proc);
-for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
+for (n = rb_first(&proc->threads); n; n = rb_next(n))
print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread,
-rb_node), print_all);
+rb_node), print_all, hash_ptrs);
-for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) {
+for (n = rb_first(&proc->nodes); n; n = rb_next(n)) {
struct binder_node *node = rb_entry(n, struct binder_node,
rb_node);
if (!print_all && !node->has_async_transaction)
continue;
-/*
-* take a temporary reference on the node so it
-* survives and isn't removed from the tree
-* while we print it.
-*/
-binder_inc_node_tmpref_ilocked(node);
-/* Need to drop inner lock to take node lock */
-binder_inner_proc_unlock(proc);
-if (last_node)
-binder_put_node(last_node);
-binder_node_inner_lock(node);
-print_binder_node_nilocked(m, node);
-binder_node_inner_unlock(node);
-last_node = node;
-binder_inner_proc_lock(proc);
+last_node = print_next_binder_node_ilocked(m, proc, node,
+last_node,
+hash_ptrs);
}
binder_inner_proc_unlock(proc);
if (last_node)
@@ -6819,9 +6867,7 @@ static void print_binder_proc(struct seq_file *m,
if (print_all) {
binder_proc_lock(proc);
-for (n = rb_first(&proc->refs_by_desc);
-n != NULL;
-n = rb_next(n))
+for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n))
print_binder_ref_olocked(m, rb_entry(n,
struct binder_ref,
rb_node_desc));
@@ -6831,7 +6877,8 @@ static void print_binder_proc(struct seq_file *m,
binder_inner_proc_lock(proc);
list_for_each_entry(w, &proc->todo, entry)
print_binder_work_ilocked(m, proc, " ",
-" pending transaction", w);
+" pending transaction", w,
+hash_ptrs);
list_for_each_entry(w, &proc->delivered_death, entry) {
seq_puts(m, " has delivered dead binder\n");
break;
@@ -6958,7 +7005,7 @@ static void print_binder_proc_stats(struct seq_file *m,
count = 0;
ready_threads = 0;
binder_inner_proc_lock(proc);
-for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
+for (n = rb_first(&proc->threads); n; n = rb_next(n))
count++;
list_for_each_entry(thread, &proc->waiting_threads, waiting_thread_node)
@@ -6972,7 +7019,7 @@ static void print_binder_proc_stats(struct seq_file *m,
ready_threads,
free_async_space);
count = 0;
-for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n))
+for (n = rb_first(&proc->nodes); n; n = rb_next(n))
count++;
binder_inner_proc_unlock(proc);
seq_printf(m, " nodes: %d\n", count);
@@ -6980,7 +7027,7 @@ static void print_binder_proc_stats(struct seq_file *m,
strong = 0;
weak = 0;
binder_proc_lock(proc);
-for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) {
+for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n)) {
struct binder_ref *ref = rb_entry(n, struct binder_ref,
rb_node_desc);
count++;
@@ -7007,7 +7054,7 @@ static void print_binder_proc_stats(struct seq_file *m,
print_binder_stats(m, " ", &proc->stats);
}
-static int state_show(struct seq_file *m, void *unused)
+static void print_binder_state(struct seq_file *m, bool hash_ptrs)
{
struct binder_proc *proc;
struct binder_node *node;
@@ -7018,31 +7065,40 @@ static int state_show(struct seq_file *m, void *unused)
spin_lock(&binder_dead_nodes_lock);
if (!hlist_empty(&binder_dead_nodes))
seq_puts(m, "dead nodes:\n");
-hlist_for_each_entry(node, &binder_dead_nodes, dead_node) {
-/*
-* take a temporary reference on the node so it
-* survives and isn't removed from the list
-* while we print it.
-*/
-node->tmp_refs++;
-spin_unlock(&binder_dead_nodes_lock);
-if (last_node)
-binder_put_node(last_node);
-binder_node_lock(node);
-print_binder_node_nilocked(m, node);
-binder_node_unlock(node);
-last_node = node;
-spin_lock(&binder_dead_nodes_lock);
-}
+hlist_for_each_entry(node, &binder_dead_nodes, dead_node)
+last_node = print_next_binder_node_ilocked(m, NULL, node,
+last_node,
+hash_ptrs);
spin_unlock(&binder_dead_nodes_lock);
if (last_node)
binder_put_node(last_node);
mutex_lock(&binder_procs_lock);
hlist_for_each_entry(proc, &binder_procs, proc_node)
-print_binder_proc(m, proc, 1);
+print_binder_proc(m, proc, true, hash_ptrs);
mutex_unlock(&binder_procs_lock);
+}
+
+static void print_binder_transactions(struct seq_file *m, bool hash_ptrs)
+{
+struct binder_proc *proc;
+
+seq_puts(m, "binder transactions:\n");
+mutex_lock(&binder_procs_lock);
+hlist_for_each_entry(proc, &binder_procs, proc_node)
+print_binder_proc(m, proc, false, hash_ptrs);
+mutex_unlock(&binder_procs_lock);
+}
+
+static int state_show(struct seq_file *m, void *unused)
+{
+print_binder_state(m, false);
+return 0;
+}
+
+static int state_hashed_show(struct seq_file *m, void *unused)
+{
+print_binder_state(m, true);
return 0;
}
@@ -7064,14 +7120,13 @@ static int stats_show(struct seq_file *m, void *unused)
static int transactions_show(struct seq_file *m, void *unused)
{
-struct binder_proc *proc;
+print_binder_transactions(m, false);
+return 0;
+}
-seq_puts(m, "binder transactions:\n");
-mutex_lock(&binder_procs_lock);
-hlist_for_each_entry(proc, &binder_procs, proc_node)
-print_binder_proc(m, proc, 0);
-mutex_unlock(&binder_procs_lock);
+static int transactions_hashed_show(struct seq_file *m, void *unused)
+{
+print_binder_transactions(m, true);
return 0;
}
@@ -7084,7 +7139,7 @@ static int proc_show(struct seq_file *m, void *unused)
hlist_for_each_entry(itr, &binder_procs, proc_node) {
if (itr->pid == pid) {
seq_puts(m, "binder proc state:\n");
-print_binder_proc(m, itr, 1);
+print_binder_proc(m, itr, true, false);
}
}
mutex_unlock(&binder_procs_lock);
@@ -7151,8 +7206,10 @@ const struct file_operations binder_fops = {
};
DEFINE_SHOW_ATTRIBUTE(state);
+DEFINE_SHOW_ATTRIBUTE(state_hashed);
DEFINE_SHOW_ATTRIBUTE(stats);
DEFINE_SHOW_ATTRIBUTE(transactions);
+DEFINE_SHOW_ATTRIBUTE(transactions_hashed);
DEFINE_SHOW_ATTRIBUTE(transaction_log);
const struct binder_debugfs_entry binder_debugfs_entries[] = {
@@ -7162,6 +7219,12 @@ const struct binder_debugfs_entry binder_debugfs_entries[] = {
.fops = &state_fops,
.data = NULL,
},
+{
+.name = "state_hashed",
+.mode = 0444,
+.fops = &state_hashed_fops,
+.data = NULL,
+},
{
.name = "stats",
.mode = 0444,
@@ -7174,6 +7237,12 @@ const struct binder_debugfs_entry binder_debugfs_entries[] = {
.fops = &transactions_fops,
.data = NULL,
},
+{
+.name = "transactions_hashed",
+.mode = 0444,
+.fops = &transactions_hashed_fops,
+.data = NULL,
+},
{
.name = "transaction_log",
.mode = 0444,

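For context on the hashed variants added above: with hash_ptrs set, node addresses are printed with %p, which the kernel hashes by default, while the existing files keep the raw %016llx format. A minimal illustration of the difference (not the driver's code):

    /* Illustration only: %p prints a hashed value, %016llx the raw cookie. */
    pr_info("node %d: u%p c%p\n", debug_id, (void *)(long)ptr, (void *)(long)cookie);
    pr_info("node %d: u%016llx c%016llx\n", debug_id, (u64)ptr, (u64)cookie);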

@@ -490,6 +490,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_lruvec_add_folio);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_lruvec_del_folio);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_lazyfree_bypass);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_async_mmap_readahead);
+EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mm_free_page);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_page_mapcount);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_page_to_lrulist);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_del_page_from_lrulist);
@@ -676,3 +677,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_filemap_fault_pre_folio_locked);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_filemap_folio_mapped);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_folio_remove_rmap_ptes);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pageset_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_xhci_full_reset_on_remove);
+EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mempool_alloc_skip_wait);


@@ -1002,7 +1002,7 @@ static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer)
* If 'expires' is after the current time, we've been called
* too early.
*/
-if (expires > 0 && expires < ktime_get_mono_fast_ns()) {
+if (expires > 0 && expires <= ktime_get_mono_fast_ns()) {
dev->power.timer_expires = 0;
rpm_suspend(dev, dev->power.timer_autosuspends ?
(RPM_ASYNC | RPM_AUTO) : RPM_ASYNC);


@@ -284,15 +284,13 @@ static int kvm_arm_smmu_domain_finalize(struct kvm_arm_smmu_domain *kvm_smmu_dom
return 0;
}
-kvm_smmu_domain->smmu = smmu;
if (kvm_smmu_domain->domain.type == IOMMU_DOMAIN_IDENTITY) {
kvm_smmu_domain->id = KVM_IOMMU_DOMAIN_IDMAP_ID;
/*
* Identity domains doesn't use the DMA API, so no need to
* set the domain aperture.
*/
-return 0;
+goto out;
}
/* Default to stage-1. */
@@ -325,7 +323,13 @@ static int kvm_arm_smmu_domain_finalize(struct kvm_arm_smmu_domain *kvm_smmu_dom
ret = kvm_call_hyp_nvhe_mc(__pkvm_host_iommu_alloc_domain,
kvm_smmu_domain->id, kvm_smmu_domain->type);
+if (ret) {
+ida_free(&kvm_arm_smmu_domain_ida, kvm_smmu_domain->id);
+return ret;
+}
+out:
+kvm_smmu_domain->smmu = smmu;
return ret;
}


@@ -629,7 +629,6 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd)
int tag = scsi_cmd_to_rq(cmd)->tag;
struct ufshcd_lrb *lrbp = &hba->lrb[tag];
struct ufs_hw_queue *hwq;
-unsigned long flags;
int err;
/* Skip task abort in case previous aborts failed and report failure */
@@ -668,10 +667,5 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd)
return FAILED;
}
-spin_lock_irqsave(&hwq->cq_lock, flags);
-if (ufshcd_cmd_inflight(lrbp->cmd))
-ufshcd_release_scsi_cmd(hba, lrbp);
-spin_unlock_irqrestore(&hwq->cq_lock, flags);
return SUCCESS;
}


@@ -6545,9 +6545,14 @@ static void ufshcd_err_handler(struct work_struct *work)
up(&hba->host_sem);
return;
}
+spin_unlock_irqrestore(hba->host->host_lock, flags);
+
+ufshcd_err_handling_prepare(hba);
+
+spin_lock_irqsave(hba->host->host_lock, flags);
ufshcd_set_eh_in_progress(hba);
spin_unlock_irqrestore(hba->host->host_lock, flags);
-ufshcd_err_handling_prepare(hba);
/* Complete requests that have door-bell cleared by h/w */
ufshcd_complete_requests(hba, false);
spin_lock_irqsave(hba->host->host_lock, flags);


@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/dmi.h>
#include <linux/dma-mapping.h>
+#include <trace/hooks/usb.h>
#include "xhci.h"
#include "xhci-trace.h"
@@ -196,6 +197,7 @@ int xhci_reset(struct xhci_hcd *xhci, u64 timeout_us)
u32 command;
u32 state;
int ret;
+bool full_reset = 0;
state = readl(&xhci->op_regs->status);
@@ -224,8 +226,11 @@ int xhci_reset(struct xhci_hcd *xhci, u64 timeout_us)
if (xhci->quirks & XHCI_INTEL_HOST)
udelay(1000);
+trace_android_vh_xhci_full_reset_on_remove(&full_reset);
+
ret = xhci_handshake_check_state(xhci, &xhci->op_regs->command,
-CMD_RESET, 0, timeout_us, XHCI_STATE_REMOVING);
+CMD_RESET, 0, timeout_us,
+full_reset ? 0 : XHCI_STATE_REMOVING);
if (ret)
return ret;


@@ -547,6 +547,14 @@ struct pd_rx_event {
struct pd_message msg;
};
+struct altmode_vdm_event {
+struct kthread_work work;
+struct tcpm_port *port;
+u32 header;
+u32 *data;
+int cnt;
+};
+
static const char * const pd_rev[] = {
[PD_REV10] = "rev1",
[PD_REV20] = "rev2",
@@ -1531,12 +1539,64 @@ static void tcpm_queue_vdm(struct tcpm_port *port, const u32 header,
mod_vdm_delayed_work(port, 0);
}
-static void tcpm_queue_vdm_unlocked(struct tcpm_port *port, const u32 header,
+static void tcpm_queue_vdm_work(struct kthread_work *work)
+{
+struct altmode_vdm_event *event = container_of(work,
+struct altmode_vdm_event,
+work);
+struct tcpm_port *port = event->port;
+
+mutex_lock(&port->lock);
+if (port->state != SRC_READY && port->state != SNK_READY) {
+tcpm_log_force(port, "dropping altmode_vdm_event");
+goto port_unlock;
+}
+
+tcpm_queue_vdm(port, event->header, event->data, event->cnt);
+
+port_unlock:
+kfree(event->data);
+kfree(event);
+mutex_unlock(&port->lock);
+}
+
+static int tcpm_queue_vdm_unlocked(struct tcpm_port *port, const u32 header,
const u32 *data, int cnt)
{
-mutex_lock(&port->lock);
-tcpm_queue_vdm(port, header, data, cnt);
-mutex_unlock(&port->lock);
+struct altmode_vdm_event *event;
+u32 *data_cpy;
+int ret = -ENOMEM;
+
+event = kzalloc(sizeof(*event), GFP_KERNEL);
+if (!event)
+goto err_event;
+
+data_cpy = kcalloc(cnt, sizeof(u32), GFP_KERNEL);
+if (!data_cpy)
+goto err_data;
+
+kthread_init_work(&event->work, tcpm_queue_vdm_work);
+event->port = port;
+event->header = header;
+memcpy(data_cpy, data, sizeof(u32) * cnt);
+event->data = data_cpy;
+event->cnt = cnt;
+
+ret = kthread_queue_work(port->wq, &event->work);
+if (!ret) {
+ret = -EBUSY;
+goto err_queue;
+}
+
+return 0;
+
+err_queue:
+kfree(data_cpy);
+err_data:
+kfree(event);
+err_event:
+tcpm_log_force(port, "failed to queue altmode vdm, err:%d", ret);
+return ret;
}
static void svdm_consume_identity(struct tcpm_port *port, const u32 *p, int cnt)
@@ -2297,8 +2357,7 @@ static int tcpm_altmode_enter(struct typec_altmode *altmode, u32 *vdo)
header = VDO(altmode->svid, vdo ? 2 : 1, svdm_version, CMD_ENTER_MODE);
header |= VDO_OPOS(altmode->mode);
-tcpm_queue_vdm_unlocked(port, header, vdo, vdo ? 1 : 0);
-return 0;
+return tcpm_queue_vdm_unlocked(port, header, vdo, vdo ? 1 : 0);
}
static int tcpm_altmode_exit(struct typec_altmode *altmode)
@@ -2314,8 +2373,7 @@ static int tcpm_altmode_exit(struct typec_altmode *altmode)
header = VDO(altmode->svid, 1, svdm_version, CMD_EXIT_MODE);
header |= VDO_OPOS(altmode->mode);
-tcpm_queue_vdm_unlocked(port, header, NULL, 0);
-return 0;
+return tcpm_queue_vdm_unlocked(port, header, NULL, 0);
}
static int tcpm_altmode_vdm(struct typec_altmode *altmode,
@@ -2323,9 +2381,7 @@ static int tcpm_altmode_vdm(struct typec_altmode *altmode,
{
struct tcpm_port *port = typec_altmode_get_drvdata(altmode);
-tcpm_queue_vdm_unlocked(port, header, data, count - 1);
-return 0;
+return tcpm_queue_vdm_unlocked(port, header, data, count - 1);
}
static const struct typec_altmode_ops tcpm_altmode_ops = {

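The tcpm change above queues alternate-mode VDMs onto the port's kthread worker instead of taking port->lock inline, so the altmode ops can now fail and callers should check the return value. A hedged sketch of how an altmode driver might react to the new failure case (the driver name and policy are made up for illustration):

    /* Illustrative caller; the retry/warn policy here is an assumption. */
    static int example_altmode_enter(struct typec_altmode *alt, u32 *vdo)
    {
    	int ret = typec_altmode_enter(alt, vdo);

    	if (ret)	/* VDM could not be queued, e.g. port not ready */
    		pr_warn("enter mode not queued: %d\n", ret);
    	return ret;
    }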

@@ -336,6 +336,7 @@ static struct workqueue_struct *z_erofs_workqueue __read_mostly;
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static struct kthread_worker __rcu **z_erofs_pcpu_workers;
+static atomic_t erofs_percpu_workers_initialized = ATOMIC_INIT(0);
static void erofs_destroy_percpu_workers(void)
{
@@ -381,12 +382,8 @@ static int erofs_init_percpu_workers(void)
}
return 0;
}
-#else
-static inline void erofs_destroy_percpu_workers(void) {}
-static inline int erofs_init_percpu_workers(void) { return 0; }
-#endif
-#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
+#ifdef CONFIG_HOTPLUG_CPU
static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
static enum cpuhp_state erofs_cpuhp_state;
@@ -443,15 +440,53 @@ static void erofs_cpu_hotplug_destroy(void)
if (erofs_cpuhp_state)
cpuhp_remove_state_nocalls(erofs_cpuhp_state);
}
-#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
+#else /* !CONFIG_HOTPLUG_CPU */
static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
-#endif
+#endif/* CONFIG_HOTPLUG_CPU */
+
+static int z_erofs_init_pcpu_workers(struct super_block *sb)
+{
+int err;
+
+if (atomic_xchg(&erofs_percpu_workers_initialized, 1))
+return 0;
+
+err = erofs_init_percpu_workers();
+if (err) {
+erofs_err(sb, "per-cpu workers: failed to allocate.");
+goto err_init_percpu_workers;
+}
+
+err = erofs_cpu_hotplug_init();
+if (err < 0) {
+erofs_err(sb, "per-cpu workers: failed CPU hotplug init.");
+goto err_cpuhp_init;
+}
+erofs_info(sb, "initialized per-cpu workers successfully.");
+return err;
+
+err_cpuhp_init:
+erofs_destroy_percpu_workers();
+err_init_percpu_workers:
+atomic_set(&erofs_percpu_workers_initialized, 0);
+return err;
+}
+
+static void z_erofs_destroy_pcpu_workers(void)
+{
+if (!atomic_xchg(&erofs_percpu_workers_initialized, 0))
+return;
+erofs_cpu_hotplug_destroy();
+erofs_destroy_percpu_workers();
+}
+#else /* !CONFIG_EROFS_FS_PCPU_KTHREAD */
+static inline int z_erofs_init_pcpu_workers(struct super_block *sb) { return 0; }
+static inline void z_erofs_destroy_pcpu_workers(void) {}
+#endif/* CONFIG_EROFS_FS_PCPU_KTHREAD */
void z_erofs_exit_zip_subsystem(void)
{
-erofs_cpu_hotplug_destroy();
-erofs_destroy_percpu_workers();
+z_erofs_destroy_pcpu_workers();
destroy_workqueue(z_erofs_workqueue);
z_erofs_destroy_pcluster_pool();
}
@@ -467,23 +502,12 @@ int __init z_erofs_init_zip_subsystem(void)
WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
if (!z_erofs_workqueue) {
err = -ENOMEM;
-goto out_error_workqueue_init;
+goto out_err_workqueue_init;
}
-err = erofs_init_percpu_workers();
-if (err)
-goto out_error_pcpu_worker;
-err = erofs_cpu_hotplug_init();
-if (err < 0)
-goto out_error_cpuhp_init;
return err;
-out_error_cpuhp_init:
-erofs_destroy_percpu_workers();
-out_error_pcpu_worker:
-destroy_workqueue(z_erofs_workqueue);
-out_error_workqueue_init:
+out_err_workqueue_init:
z_erofs_destroy_pcluster_pool();
out_error_pcluster_pool:
return err;
@@ -711,8 +735,14 @@ static const struct address_space_operations z_erofs_cache_aops = {
int erofs_init_managed_cache(struct super_block *sb)
{
-struct inode *const inode = new_inode(sb);
+struct inode *inode;
+int err;
+
+err = z_erofs_init_pcpu_workers(sb);
+if (err)
+return err;
+
+inode = new_inode(sb);
if (!inode)
return -ENOMEM;


@@ -799,6 +799,10 @@ int fuse_file_read_iter_initialize(
.size = to->count,
};
+fri->frio = (struct fuse_read_iter_out) {
+.ret = fri->fri.size,
+};
+
/* TODO we can't assume 'to' is a kvec */
/* TODO we also can't assume the vector has only one component */
*fa = (struct fuse_bpf_args) {
@@ -833,6 +837,11 @@ int fuse_file_read_iter_backing(struct fuse_bpf_args *fa,
if (!iov_iter_count(to))
return 0;
+if ((iocb->ki_flags & IOCB_DIRECT) &&
+(!ff->backing_file->f_mapping->a_ops ||
+!ff->backing_file->f_mapping->a_ops->direct_IO))
+return -EINVAL;
+
/* TODO This just plain ignores any change to fuse_read_in */
if (is_sync_kiocb(iocb)) {
ret = vfs_iter_read(ff->backing_file, to, &iocb->ki_pos,
@@ -855,13 +864,14 @@ int fuse_file_read_iter_backing(struct fuse_bpf_args *fa,
fuse_bpf_aio_cleanup_handler(aio_req);
}
+frio->ret = ret;
+
/* TODO Need to point value at the buffer for post-modification */
out:
fuse_file_accessed(file, ff->backing_file);
-frio->ret = ret;
-return ret < 0 ? ret : 0;
+return ret;
}
void *fuse_file_read_iter_finalize(struct fuse_bpf_args *fa,


@@ -41,6 +41,24 @@ struct poll_table_struct;
/* define the enumeration of all cgroup subsystems */
#define SUBSYS(_x) _x ## _cgrp_id,
+
+#define CSS_COUNTERS_SIZE (CGROUP_SUBSYS_COUNT * sizeof(atomic_t))
+
+/*
+ * This should just use max(), but max() doesn't work in struct definitions.
+ *
+ * Originally, the space was reserved for per cgroup subsystem counters, where each counter was
+ * the size of an atomic_t variable. However, it was later reused to fit a struct rcu_head
+ * which is why the calculation considers the size of struct rcu_head.
+ *
+ * This macro is provided to ANDROID_BACKPORT_USE_ARRAY() which needs to reserve at least
+ * enough memory to accommodate struct rcu_head. However, if we only reserve CSS_COUNTERS_SIZE,
+ * that may not be enough space on kernels with a small amount of cgroup subsystems enabled. So,
+ * we take the max between the two values to use in ANDROID_BACKPORT_USE_ARRAY().
+ */
+#define CGROUP_ROOT_BACKPORT_PADDING_SIZE \
+(CSS_COUNTERS_SIZE > sizeof(struct rcu_head) ? CSS_COUNTERS_SIZE : sizeof(struct rcu_head))
+
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
CGROUP_SUBSYS_COUNT,
@@ -585,8 +603,12 @@ struct cgroup_root {
/* The name for this hierarchy - may be empty */
char name[MAX_CGROUP_ROOT_NAMELEN];
-ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_SUBSYS_COUNT * sizeof(atomic_t),
-struct rcu_head rcu);
+/* Use the original calculation to preserve the CRC value for the ABI. */
+#ifndef __GENKSYMS__
+ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_ROOT_BACKPORT_PADDING_SIZE, struct rcu_head rcu);
+#else
+ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_SUBSYS_COUNT * sizeof(atomic_t), struct rcu_head rcu);
+#endif
};
/*

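The padding comment above notes that max() cannot appear in a struct definition, so the backport uses a ternary that stays an integer constant expression. A quick way one could sanity-check the reserved size at build time; this is illustrative only and not part of the patch:

    /* Illustrative compile-time checks; not part of the backport. */
    static_assert(CGROUP_ROOT_BACKPORT_PADDING_SIZE >= sizeof(struct rcu_head));
    static_assert(CGROUP_ROOT_BACKPORT_PADDING_SIZE >= CSS_COUNTERS_SIZE);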

@@ -277,15 +277,25 @@ struct mthp_stat {
#ifdef CONFIG_SYSFS
DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
-static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
{
if (order <= 0 || order > PMD_ORDER)
return;
-this_cpu_inc(mthp_stats.stats[order][item]);
+this_cpu_add(mthp_stats.stats[order][item], delta);
}
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+mod_mthp_stat(order, item, 1);
+}
+
unsigned long sum_mthp_stat(int order, enum mthp_stat_item item);
#else
+static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
+{
+}
+
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
{
}
@@ -326,7 +336,7 @@ static inline int split_huge_page(struct page *page)
{
return split_huge_page_to_list(page, NULL);
}
-void deferred_split_folio(struct folio *folio);
+void deferred_split_folio(struct folio *folio, bool partially_mapped);
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct folio *folio);
@@ -486,7 +496,7 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
-static inline void deferred_split_folio(struct folio *folio) {}
+static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)


@@ -4,6 +4,7 @@
#include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
+extern unsigned int khugepaged_max_ptes_none __read_mostly;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct attribute_group khugepaged_attr_group;


@@ -731,8 +731,15 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
__mem_cgroup_uncharge_list(page_list);
}
-void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
+void __mem_cgroup_uncharge_folios(struct folio_batch *folios);
+static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
+{
+if (mem_cgroup_disabled())
+return;
+__mem_cgroup_uncharge_folios(folios);
+}
+void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
void mem_cgroup_migrate(struct folio *old, struct folio *new);
/**
@@ -1171,6 +1178,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
+extern int mem_cgroup_init(void);
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
@@ -1297,6 +1305,10 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
{
}
+static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
+{
+}
+
static inline void mem_cgroup_replace_folio(struct folio *old,
struct folio *new)
{
@@ -1619,6 +1631,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
{
return 0;
}
+
+static inline int mem_cgroup_init(void) { return 0; }
#endif /* CONFIG_MEMCG */
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
@@ -1682,18 +1696,18 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
return folio_lruvec_lock_irq(folio);
}
-/* Don't lock again iff page's lruvec locked */
-static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio,
-struct lruvec *locked_lruvec, unsigned long *flags)
+/* Don't lock again iff folio's lruvec locked */
+static inline void folio_lruvec_relock_irqsave(struct folio *folio,
+struct lruvec **lruvecp, unsigned long *flags)
{
-if (locked_lruvec) {
-if (folio_matches_lruvec(folio, locked_lruvec))
-return locked_lruvec;
+if (*lruvecp) {
+if (folio_matches_lruvec(folio, *lruvecp))
+return;
-unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
+unlock_page_lruvec_irqrestore(*lruvecp, *flags);
}
-return folio_lruvec_lock_irqsave(folio, flags);
+*lruvecp = folio_lruvec_lock_irqsave(folio, flags);
}
#ifdef CONFIG_CGROUP_WRITEBACK

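The relock helper above now updates the locked lruvec through a pointer instead of returning it, which suits batched release loops such as the reworked folios_put_refs() in this series. A sketch of the intended calling pattern; the loop body is illustrative, not the actual mm/swap.c code:

    /* Illustrative caller loop over a folio_batch. */
    struct lruvec *lruvec = NULL;
    unsigned long flags;
    unsigned int i;

    for (i = 0; i < folios->nr; i++) {
    	struct folio *folio = folios->folios[i];

    	folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
    	lruvec_del_folio(lruvec, folio);	/* example per-folio work */
    }
    if (lruvec)
    	unlock_page_lruvec_irqrestore(lruvec, flags);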

@@ -39,6 +39,7 @@ struct anon_vma;
struct anon_vma_chain;
struct user_struct;
struct pt_regs;
+struct folio_batch;
extern int sysctl_page_lock_unfairness;
@@ -1539,6 +1540,8 @@ static inline void folio_put_refs(struct folio *folio, int refs)
__folio_put(folio);
}
+void folios_put_refs(struct folio_batch *folios, unsigned int *refs);
+
/*
* union release_pages_arg - an array of pages or folios
*
@@ -1561,18 +1564,19 @@ void release_pages(release_pages_arg, int nr);
/**
* folios_put - Decrement the reference count on an array of folios.
* @folios: The folios.
-* @nr: How many folios there are.
*
-* Like folio_put(), but for an array of folios. This is more efficient
-* than writing the loop yourself as it will optimise the locks which
-* need to be taken if the folios are freed.
+* Like folio_put(), but for a batch of folios. This is more efficient
+* than writing the loop yourself as it will optimise the locks which need
+* to be taken if the folios are freed. The folios batch is returned
+* empty and ready to be reused for another batch; there is no need to
+* reinitialise it.
*
* Context: May be called in process or interrupt context, but not in NMI
* context. May be called while holding a spinlock.
*/
-static inline void folios_put(struct folio **folios, unsigned int nr)
+static inline void folios_put(struct folio_batch *folios)
{
-release_pages(folios, nr);
+folios_put_refs(folios, NULL);
}
static inline void put_page(struct page *page)

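Since folios_put() now takes a folio_batch rather than an array plus count, callers collect folios into a batch and get the batch back empty. A minimal usage sketch under that assumption; the helper name here is made up:

    #include <linux/pagevec.h>

    /* Illustrative only: release a set of folios in batched calls. */
    static void example_release(struct folio **src, unsigned int nr)
    {
    	struct folio_batch fbatch;
    	unsigned int i;

    	folio_batch_init(&fbatch);
    	for (i = 0; i < nr; i++) {
    		if (!folio_batch_add(&fbatch, src[i]))
    			folios_put(&fbatch);	/* batch full: drop refs now */
    	}
    	folios_put(&fbatch);			/* batch is reusable afterwards */
    }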

@@ -37,6 +37,22 @@
#define NR_PAGE_ORDERS (MAX_ORDER + 1)
+/* Defines the order for the number of pages that have a migrate type. */
+#ifndef CONFIG_PAGE_BLOCK_ORDER
+#define PAGE_BLOCK_ORDER MAX_ORDER
+#else
+#define PAGE_BLOCK_ORDER CONFIG_PAGE_BLOCK_ORDER
+#endif /* CONFIG_PAGE_BLOCK_ORDER */
+
+/*
+ * The MAX_ORDER, which defines the max order of pages to be allocated
+ * by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_ORDER,
+ * which defines the order for the number of pages that can have a migrate type
+ */
+#if (PAGE_BLOCK_ORDER > MAX_ORDER)
+#error MAX_ORDER must be >= PAGE_BLOCK_ORDER
+#endif
+
/*
* PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
* costly to service. That is between allocation orders which should


@@ -197,6 +197,7 @@ enum pageflags {
/* At least one page in this folio has the hwpoison flag set */
PG_has_hwpoisoned = PG_error,
PG_large_rmappable = PG_workingset, /* anon or file-backed */
+PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
};
#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
@@ -372,54 +373,77 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n)
#define FOLIO_PF_NO_COMPOUND 0
#define FOLIO_PF_SECOND 1
+#define FOLIO_HEAD_PAGE 0
+#define FOLIO_SECOND_PAGE 1
+
/*
* Macros to create function definitions for page flags
*/
+#define FOLIO_TEST_FLAG(name, page) \
+static __always_inline bool folio_test_##name(struct folio *folio) \
+{ return test_bit(PG_##name, folio_flags(folio, page)); }
+
+#define FOLIO_SET_FLAG(name, page) \
+static __always_inline void folio_set_##name(struct folio *folio) \
+{ set_bit(PG_##name, folio_flags(folio, page)); }
+
+#define FOLIO_CLEAR_FLAG(name, page) \
+static __always_inline void folio_clear_##name(struct folio *folio) \
+{ clear_bit(PG_##name, folio_flags(folio, page)); }
+
+#define __FOLIO_SET_FLAG(name, page) \
+static __always_inline void __folio_set_##name(struct folio *folio) \
+{ __set_bit(PG_##name, folio_flags(folio, page)); }
+
+#define __FOLIO_CLEAR_FLAG(name, page) \
+static __always_inline void __folio_clear_##name(struct folio *folio) \
+{ __clear_bit(PG_##name, folio_flags(folio, page)); }
+
+#define FOLIO_TEST_SET_FLAG(name, page) \
+static __always_inline bool folio_test_set_##name(struct folio *folio) \
+{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); }
+
+#define FOLIO_TEST_CLEAR_FLAG(name, page) \
+static __always_inline bool folio_test_clear_##name(struct folio *folio) \
+{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); }
+
+#define FOLIO_FLAG(name, page) \
+FOLIO_TEST_FLAG(name, page) \
+FOLIO_SET_FLAG(name, page) \
+FOLIO_CLEAR_FLAG(name, page)
+
#define TESTPAGEFLAG(uname, lname, policy) \
-static __always_inline bool folio_test_##lname(struct folio *folio) \
-{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+FOLIO_TEST_FLAG(lname, FOLIO_##policy) \
static __always_inline int Page##uname(struct page *page) \
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
#define SETPAGEFLAG(uname, lname, policy) \
-static __always_inline \
-void folio_set_##lname(struct folio *folio) \
-{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+FOLIO_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline void SetPage##uname(struct page *page) \
{ set_bit(PG_##lname, &policy(page, 1)->flags); }
#define CLEARPAGEFLAG(uname, lname, policy) \
-static __always_inline \
-void folio_clear_##lname(struct folio *folio) \
-{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline void ClearPage##uname(struct page *page) \
{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define __SETPAGEFLAG(uname, lname, policy) \
-static __always_inline \
-void __folio_set_##lname(struct folio *folio) \
-{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+__FOLIO_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline void __SetPage##uname(struct page *page) \
{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
#define __CLEARPAGEFLAG(uname, lname, policy) \
-static __always_inline \
-void __folio_clear_##lname(struct folio *folio) \
-{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline void __ClearPage##uname(struct page *page) \
{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define TESTSETFLAG(uname, lname, policy) \
-static __always_inline \
-bool folio_test_set_##lname(struct folio *folio) \
-{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline int TestSetPage##uname(struct page *page) \
{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
#define TESTCLEARFLAG(uname, lname, policy) \
-static __always_inline \
-bool folio_test_clear_##lname(struct folio *folio) \
-{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline int TestClearPage##uname(struct page *page) \
{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
@@ -842,8 +866,18 @@ static inline void ClearPageCompound(struct page *page)
ClearPageHead(page);
}
PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND)
+FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+/*
+ * PG_partially_mapped is protected by deferred_split split_queue_lock,
+ * so its safe to use non-atomic set/clear.
+ */
+__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
#else
TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable)
+FOLIO_TEST_FLAG_FALSE(partially_mapped)
+__FOLIO_SET_FLAG_NOOP(partially_mapped)
+__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
#endif
#define PG_head_mask ((1UL << PG_head))
@@ -1111,7 +1145,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
*/
#define PAGE_FLAGS_SECOND \
(0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \
-1UL << PG_large_rmappable)
+1UL << PG_large_rmappable | 1UL << PG_partially_mapped)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)


@@ -3,10 +3,6 @@
#define __LINUX_PAGEISOLATION_H
#ifdef CONFIG_MEMORY_ISOLATION
-static inline bool has_isolate_pageblock(struct zone *zone)
-{
-return zone->nr_isolate_pageblock;
-}
static inline bool is_migrate_isolate_page(struct page *page)
{
return get_pageblock_migratetype(page) == MIGRATE_ISOLATE;
@@ -16,10 +12,6 @@ static inline bool is_migrate_isolate(int migratetype)
return migratetype == MIGRATE_ISOLATE;
}
#else
-static inline bool has_isolate_pageblock(struct zone *zone)
-{
-return false;
-}
static inline bool is_migrate_isolate_page(struct page *page)
{
return false;


@@ -28,7 +28,7 @@ enum pageblock_bits {
NR_PAGEBLOCK_BITS
};
-#ifdef CONFIG_HUGETLB_PAGE
+#if defined(CONFIG_HUGETLB_PAGE)
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -41,14 +41,18 @@ extern unsigned int pageblock_order;
* Huge pages are a constant size, but don't exceed the maximum allocation
* granularity.
*/
-#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER)
+#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, PAGE_BLOCK_ORDER)
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
-#else /* CONFIG_HUGETLB_PAGE */
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE)
+
+#define pageblock_order min_t(unsigned int, HPAGE_PMD_ORDER, PAGE_BLOCK_ORDER)
+
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
-#define pageblock_order MAX_ORDER
+#define pageblock_order PAGE_BLOCK_ORDER
#endif /* CONFIG_HUGETLB_PAGE */


@@ -742,7 +742,12 @@ int folio_mkclean(struct folio *);
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
struct vm_area_struct *vma);
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
+enum rmp_flags {
+RMP_LOCKED = 1 << 0,
+RMP_USE_SHARED_ZEROPAGE = 1 << 1,
+};
+
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);


@@ -52,6 +52,8 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip,
int trace_array_init_printk(struct trace_array *tr);
void trace_array_put(struct trace_array *tr);
struct trace_array *trace_array_get_by_name(const char *name);
+struct trace_array *trace_array_get_by_name_ext(const char *name,
+const char *systems);
int trace_array_destroy(struct trace_array *tr);
/* For osnoise tracer */
@@ -88,6 +90,11 @@ static inline struct trace_array *trace_array_get_by_name(const char *name)
{
return NULL;
}
+
+static inline struct trace_array *trace_array_get_by_name_ext(
+const char *name, const char *systems)
+{
+return NULL;
+}
static inline int trace_array_destroy(struct trace_array *tr)
{
return 0;


@@ -8,21 +8,46 @@
#include <linux/refcount.h>
#include <net/sock.h>
-void unix_inflight(struct user_struct *user, struct file *fp);
-void unix_notinflight(struct user_struct *user, struct file *fp);
-void unix_destruct_scm(struct sk_buff *skb);
-void io_uring_destruct_scm(struct sk_buff *skb);
-void unix_gc(void);
-void wait_for_unix_gc(void);
+#if IS_ENABLED(CONFIG_UNIX)
struct unix_sock *unix_get_socket(struct file *filp);
+#else
+static inline struct unix_sock *unix_get_socket(struct file *filp)
+{
+return NULL;
+}
+#endif
+
+extern unsigned int unix_tot_inflight;
+void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
+void unix_del_edges(struct scm_fp_list *fpl);
+void unix_update_edges(struct unix_sock *receiver);
+int unix_prepare_fpl(struct scm_fp_list *fpl);
+void unix_destroy_fpl(struct scm_fp_list *fpl);
+void unix_gc(void);
+void wait_for_unix_gc(struct scm_fp_list *fpl);
+
+struct unix_vertex {
+struct list_head edges;
+struct list_head entry;
+struct list_head scc_entry;
+unsigned long out_degree;
+unsigned long index;
+unsigned long scc_index;
+};
+
+struct unix_edge {
+struct unix_sock *predecessor;
+struct unix_sock *successor;
+struct list_head vertex_entry;
+struct list_head stack_entry;
+};
+
struct sock *unix_peer_get(struct sock *sk);
#define UNIX_HASH_MOD (256 - 1)
#define UNIX_HASH_SIZE (256 * 2)
#define UNIX_HASH_BITS 8
-extern unsigned int unix_tot_inflight;
struct unix_address {
refcount_t refcnt;
int len;
@@ -42,6 +67,7 @@ struct unix_skb_parms {
struct scm_stat {
atomic_t nr_fds;
+unsigned long nr_unix_fds;
};
#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb))
@@ -54,12 +80,9 @@ struct unix_sock {
struct path path;
struct mutex iolock, bindlock;
struct sock *peer;
-struct list_head link;
-unsigned long inflight;
+struct unix_vertex *vertex;
+struct sock *listener;
spinlock_t lock;
-unsigned long gc_flags;
-#define UNIX_GC_CANDIDATE 0
-#define UNIX_GC_MAYBE_CYCLE 1
struct socket_wq peer_wq;
wait_queue_entry_t peer_wake;
struct scm_stat scm_stat;

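The unix_vertex and unix_edge structures above back the replacement SCC-based garbage collector from the series listed in this merge: each inflight AF_UNIX fd contributes one edge from the vertex of the socket being passed to the receiving socket, and Tarjan's algorithm then runs over those vertices. A rough, purely illustrative sketch of that relationship; the real bookkeeping lives in net/unix/garbage.c and differs in detail:

    /* Illustration of the edge layout only. */
    static void example_link_edge(struct unix_edge *edge,
    			      struct unix_sock *sender,
    			      struct unix_sock *receiver)
    {
    	edge->predecessor = sender;	/* socket carried in the SCM_RIGHTS fds */
    	edge->successor = receiver;	/* socket the skb is queued on */
    	list_add_tail(&edge->vertex_entry, &sender->vertex->edges);
    }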

@@ -22,11 +22,24 @@ struct scm_creds {
kgid_t gid;
};
+#ifdef CONFIG_UNIX
+struct unix_edge;
+#endif
+
struct scm_fp_list {
short count;
short max;
struct user_struct *user;
struct file *fp[SCM_MAX_FD];
+#ifndef __GENKSYMS__
+#ifdef CONFIG_UNIX
+bool inflight;
+bool dead;
+struct list_head vertices;
+struct unix_edge *edges;
+#endif
+short count_unix;
+#endif
};
struct scm_cookie {


@@ -431,6 +431,9 @@ DECLARE_HOOK(android_vh_add_lazyfree_bypass,
DECLARE_HOOK(android_vh_do_async_mmap_readahead,
TP_PROTO(struct vm_fault *vmf, struct folio *folio, bool *skip),
TP_ARGS(vmf, folio, skip));
+DECLARE_HOOK(android_vh_mm_free_page,
+TP_PROTO(struct page *page),
+TP_ARGS(page));
DECLARE_HOOK(android_vh_cma_debug_show_areas,
TP_PROTO(bool *show),
@@ -596,6 +599,9 @@ DECLARE_HOOK(android_vh_folio_remove_rmap_ptes,
DECLARE_HOOK(android_vh_pageset_update,
TP_PROTO(unsigned long *high, unsigned long *batch),
TP_ARGS(high, batch));
+DECLARE_HOOK(android_vh_mempool_alloc_skip_wait,
+TP_PROTO(gfp_t *gfp_flags, bool *skip_wait),
+TP_ARGS(gfp_flags, skip_wait));
#endif /* _TRACE_HOOK_MM_H */
/* This part must be outside protection */

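The two vendor hooks declared above follow the usual Android vendor-hook pattern: a vendor module registers a probe through the generated register_trace_android_vh_*() helper and flips the output parameter. A hedged sketch; the handler policy below is invented purely for illustration:

    /* Illustrative vendor module snippet; the skip policy is made up. */
    static void example_skip_wait(void *data, gfp_t *gfp_flags, bool *skip_wait)
    {
    	if (!(*gfp_flags & __GFP_DIRECT_RECLAIM))
    		*skip_wait = true;	/* don't sleep in mempool_alloc() */
    }

    static int __init example_init(void)
    {
    	return register_trace_android_vh_mempool_alloc_skip_wait(example_skip_wait,
    								 NULL);
    }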

@@ -31,6 +31,10 @@ DECLARE_HOOK(android_vh_usb_new_device_added,
TP_PROTO(struct usb_device *udev, int *err),
TP_ARGS(udev, err));
+DECLARE_HOOK(android_vh_xhci_full_reset_on_remove,
+TP_PROTO(bool *full_reset),
+TP_ARGS(full_reset));
+
#endif /* _TRACE_HOOK_USB_H */
/* This part must be outside protection */
#include <trace/define_trace.h>


@@ -50,6 +50,7 @@
#include <linux/writeback.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/efi.h>
#include <linux/tick.h>
@@ -1062,6 +1063,7 @@ void start_kernel(void)
proc_root_init();
nsfs_init();
cpuset_init();
+mem_cgroup_init();
cgroup_init();
taskstats_init_early();
delayacct_init();


@@ -452,7 +452,7 @@ struct kmem_cache *files_cachep;
struct kmem_cache *fs_cachep; struct kmem_cache *fs_cachep;
/* SLAB cache for vm_area_struct structures */ /* SLAB cache for vm_area_struct structures */
-static struct kmem_cache *vm_area_cachep;
+struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */ /* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep; static struct kmem_cache *mm_cachep;


@@ -227,6 +227,14 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
void irq_startup_managed(struct irq_desc *desc) void irq_startup_managed(struct irq_desc *desc)
{ {
struct irq_data *d = irq_desc_get_irq_data(desc);
/*
* Clear managed-shutdown flag, so we don't repeat managed-startup for
* multiple hotplugs, and cause imbalanced disable depth.
*/
irqd_clr_managed_shutdown(d);
/* /*
* Only start it up when the disable depth is 1, so that a disable, * Only start it up when the disable depth is 1, so that a disable,
* hotunplug, hotplug sequence does not end up enabling it during * hotunplug, hotplug sequence does not end up enabling it during


@@ -211,13 +211,6 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
!irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
return; return;
/*
* Don't restore suspended interrupts here when a system comes back
* from S3. They are reenabled via resume_device_irqs().
*/
if (desc->istate & IRQS_SUSPENDED)
return;
if (irqd_is_managed_and_shutdown(data)) if (irqd_is_managed_and_shutdown(data))
irq_startup_managed(desc); irq_startup_managed(desc);


@@ -9538,16 +9538,19 @@ static int trace_array_create_dir(struct trace_array *tr)
return ret; return ret;
} }
-static struct trace_array *trace_array_create(const char *name)
+static struct trace_array *
+trace_array_create_systems(const char *name, const char *systems)
 {
+struct trace_array_ext *tr_ext;
 struct trace_array *tr;
 int ret;
 ret = -ENOMEM;
-tr = kzalloc(sizeof(*tr), GFP_KERNEL);
-if (!tr)
+tr_ext = kzalloc(sizeof(*tr_ext), GFP_KERNEL);
+if (!tr_ext)
 return ERR_PTR(ret);
+tr = &tr_ext->trace_array;
tr->name = kstrdup(name, GFP_KERNEL); tr->name = kstrdup(name, GFP_KERNEL);
if (!tr->name) if (!tr->name)
goto out_free_tr; goto out_free_tr;
@@ -9558,6 +9561,12 @@ static struct trace_array *trace_array_create(const char *name)
if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
goto out_free_tr; goto out_free_tr;
if (systems) {
tr_ext->system_names = kstrdup_const(systems, GFP_KERNEL);
if (!tr_ext->system_names)
goto out_free_tr;
}
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask); cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9601,12 +9610,18 @@ static struct trace_array *trace_array_create(const char *name)
free_trace_buffers(tr); free_trace_buffers(tr);
free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask); free_cpumask_var(tr->tracing_cpumask);
kfree_const(tr_ext->system_names);
 kfree(tr->name);
-kfree(tr);
+kfree(tr_ext);
return ERR_PTR(ret); return ERR_PTR(ret);
} }
static struct trace_array *trace_array_create(const char *name)
{
return trace_array_create_systems(name, NULL);
}
static int instance_mkdir(const char *name) static int instance_mkdir(const char *name)
{ {
struct trace_array *tr; struct trace_array *tr;
@@ -9629,9 +9644,27 @@ out_unlock:
return ret; return ret;
} }
const char *trace_array_get_system_names(struct trace_array *tr)
{
struct trace_array_ext *tr_ext;
if (tr == &global_trace)
return NULL;
tr_ext = container_of(tr, struct trace_array_ext, trace_array);
return tr_ext->system_names;
}
struct trace_array *trace_array_get_by_name(const char *name)
{
return trace_array_get_by_name_ext(name, NULL);
}
EXPORT_SYMBOL_GPL(trace_array_get_by_name);
/** /**
-* trace_array_get_by_name - Create/Lookup a trace array, given its name.
+* trace_array_get_by_name_ext - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created. * @name: The name of the trace array to be looked up/created.
* @systems: A list of systems to create event directories for (NULL for all)
* *
* Returns pointer to trace array with given name. * Returns pointer to trace array with given name.
* NULL, if it cannot be created. * NULL, if it cannot be created.
@@ -9645,7 +9678,8 @@ out_unlock:
* trace_array_put() is called, user space can not delete it. * trace_array_put() is called, user space can not delete it.
* *
*/ */
-struct trace_array *trace_array_get_by_name(const char *name)
+struct trace_array *trace_array_get_by_name_ext(const char *name,
+const char *systems)
{ {
struct trace_array *tr; struct trace_array *tr;
@@ -9657,7 +9691,7 @@ struct trace_array *trace_array_get_by_name(const char *name)
goto out_unlock; goto out_unlock;
} }
-tr = trace_array_create(name);
+tr = trace_array_create_systems(name, systems);
if (IS_ERR(tr)) if (IS_ERR(tr))
tr = NULL; tr = NULL;
@@ -9669,11 +9703,14 @@ out_unlock:
mutex_unlock(&event_mutex); mutex_unlock(&event_mutex);
return tr; return tr;
} }
-EXPORT_SYMBOL_GPL(trace_array_get_by_name);
+EXPORT_SYMBOL_GPL(trace_array_get_by_name_ext);
static int __remove_instance(struct trace_array *tr) static int __remove_instance(struct trace_array *tr)
{ {
int i; int i;
struct trace_array_ext *tr_ext = container_of(tr,
struct trace_array_ext,
trace_array);
/* Reference counter for a newly created trace array = 1. */ /* Reference counter for a newly created trace array = 1. */
if (tr->ref > 1 || (tr->current_trace && tr->trace_ref)) if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
@@ -9704,8 +9741,9 @@ static int __remove_instance(struct trace_array *tr)
free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask); free_cpumask_var(tr->tracing_cpumask);
kfree_const(tr_ext->system_names);
 kfree(tr->name);
-kfree(tr);
+kfree(tr_ext);
return 0; return 0;
} }


@@ -412,6 +412,11 @@ struct trace_array {
struct trace_func_repeats __percpu *last_func_repeats; struct trace_func_repeats __percpu *last_func_repeats;
}; };
struct trace_array_ext {
const char *system_names;
struct trace_array trace_array;
};
enum { enum {
TRACE_ARRAY_FL_GLOBAL = (1 << 0) TRACE_ARRAY_FL_GLOBAL = (1 << 0)
}; };
@@ -420,6 +425,7 @@ extern struct list_head ftrace_trace_arrays;
extern struct mutex trace_types_lock; extern struct mutex trace_types_lock;
extern const char *trace_array_get_system_names(struct trace_array *tr);
extern int trace_array_get(struct trace_array *tr); extern int trace_array_get(struct trace_array *tr);
extern int tracing_check_open_get_tr(struct trace_array *tr); extern int tracing_check_open_get_tr(struct trace_array *tr);
extern struct trace_array *trace_array_find(const char *instance); extern struct trace_array *trace_array_find(const char *instance);


@@ -3041,6 +3041,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
up_write(&trace_event_sem); up_write(&trace_event_sem);
} }
static bool event_in_systems(struct trace_event_call *call,
const char *systems)
{
const char *system;
const char *p;
if (!systems)
return true;
system = call->class->system;
p = strstr(systems, system);
if (!p)
return false;
if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
return false;
p += strlen(system);
return !*p || isspace(*p) || *p == ',';
}
static struct trace_event_file * static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call, trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr) struct trace_array *tr)
@@ -3050,9 +3071,12 @@ trace_create_new_event(struct trace_event_call *call,
struct trace_event_file *file; struct trace_event_file *file;
unsigned int first; unsigned int first;
if (!event_in_systems(call, trace_array_get_system_names(tr)))
return NULL;
file = kmem_cache_alloc(file_cachep, GFP_TRACE); file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file) if (!file)
-return NULL;
+return ERR_PTR(-ENOMEM);
pid_list = rcu_dereference_protected(tr->filtered_pids, pid_list = rcu_dereference_protected(tr->filtered_pids,
lockdep_is_held(&event_mutex)); lockdep_is_held(&event_mutex));
@@ -3117,8 +3141,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
struct trace_event_file *file; struct trace_event_file *file;
file = trace_create_new_event(call, tr); file = trace_create_new_event(call, tr);
/*
* trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
* allocation, or NULL if the event is not part of the tr->system_names.
* When the event is not part of the tr->system_names, return zero, not
* an error.
*/
if (!file) if (!file)
-return -ENOMEM;
+return 0;
if (IS_ERR(file))
return PTR_ERR(file);
if (eventdir_initialized) if (eventdir_initialized)
return event_create_dir(tr->event_dir, file); return event_create_dir(tr->event_dir, file);
@@ -3157,8 +3190,17 @@ __trace_early_add_new_event(struct trace_event_call *call,
int ret; int ret;
file = trace_create_new_event(call, tr); file = trace_create_new_event(call, tr);
/*
* trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
* allocation, or NULL if the event is not part of the tr->system_names.
* When the event is not part of the tr->system_names, return zero, not
* an error.
*/
if (!file) if (!file)
-return -ENOMEM;
+return 0;
if (IS_ERR(file))
return PTR_ERR(file);
ret = event_define_fields(call); ret = event_define_fields(call);
if (ret) if (ret)
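The event_in_systems() helper added in this file treats the instance's system list as a comma- or space-separated set of tokens and only accepts whole-token matches. A stand-alone mock of that check, with a small test harness, makes the boundary conditions easier to see; system_listed() and the sample strings are hypothetical and only mirror the logic shown above.

/* Stand-alone mock of the token matching done by event_in_systems() above. */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool system_listed(const char *systems, const char *system)
{
	const char *p;

	if (!systems)
		return true;                    /* NULL means "all systems" */

	p = strstr(systems, system);
	if (!p)
		return false;
	/* must start the string or follow a separator */
	if (p != systems && !isspace((unsigned char)*(p - 1)) && *(p - 1) != ',')
		return false;
	p += strlen(system);
	/* must end the string or be followed by a separator */
	return !*p || isspace((unsigned char)*p) || *p == ',';
}

int main(void)
{
	printf("%d\n", system_listed("sched,irq,timer", "sched")); /* 1: whole token */
	printf("%d\n", system_listed("sched_ext,irq", "sched"));   /* 0: prefix only */
	printf("%d\n", system_listed(NULL, "sched"));              /* 1: no filter */
	return 0;
}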


@@ -994,6 +994,40 @@ config CMA_AREAS
If unsure, leave the default value "7" in UMA and "19" in NUMA. If unsure, leave the default value "7" in UMA and "19" in NUMA.
#
# Select this config option from the architecture Kconfig, if available, to set
# the max page order for physically contiguous allocations.
#
config ARCH_FORCE_MAX_ORDER
int
#
# When ARCH_FORCE_MAX_ORDER is not defined,
# the default page block order is MAX_PAGE_ORDER (10) as per
# include/linux/mmzone.h.
#
config PAGE_BLOCK_ORDER
int "Page Block Order"
range 1 10 if ARCH_FORCE_MAX_ORDER = 0 || ARCH_FORCE_MAX_ORDER = ""
default 10 if ARCH_FORCE_MAX_ORDER = 0 || ARCH_FORCE_MAX_ORDER = ""
range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
help
The page block order refers to the power of two number of pages that
are physically contiguous and can have a migrate type associated to
them. The maximum size of the page block order is limited by
ARCH_FORCE_MAX_ORDER.
This config allows overriding the default page block order when the
page block order is required to be smaller than ARCH_FORCE_MAX_ORDER
or MAX_ORDER.
Reducing pageblock order can negatively impact THP generation
success rate. If your workloads uses THP heavily, please use this
option with caution.
Don't change if unsure.
config MEM_SOFT_DIRTY config MEM_SOFT_DIRTY
bool "Track memory changes" bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
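For a sense of scale in the help text above: a pageblock spans PAGE_SIZE << pageblock_order bytes, so lowering the order shrinks the granularity at which migrate types (and therefore contiguity for THP) are managed. A tiny sketch of the arithmetic, using illustrative page sizes and orders rather than the values of any particular configuration:

/* Pageblock size arithmetic for the help text above; sample values only. */
#include <stdio.h>

int main(void)
{
	unsigned long page_sizes[] = { 4096, 16384 };
	unsigned int orders[] = { 10, 7 };

	for (int i = 0; i < 2; i++)
		printf("%lu KiB pages, order %u -> %lu MiB pageblocks\n",
		       page_sizes[i] / 1024, orders[i],
		       (page_sizes[i] << orders[i]) >> 20);
	return 0;
}

With 4 KiB pages an order of 10 gives 4 MiB pageblocks, while 16 KiB pages at order 7 give 2 MiB pageblocks.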


@@ -70,6 +70,7 @@ unsigned long transparent_hugepage_flags __read_mostly =
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
static struct shrinker deferred_split_shrinker; static struct shrinker deferred_split_shrinker;
static bool split_underused_thp = true;
static atomic_t huge_zero_refcount; static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly; struct page *huge_zero_page __read_mostly;
@@ -423,6 +424,27 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj,
static struct kobj_attribute hpage_pmd_size_attr = static struct kobj_attribute hpage_pmd_size_attr =
__ATTR_RO(hpage_pmd_size); __ATTR_RO(hpage_pmd_size);
static ssize_t split_underused_thp_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%d\n", split_underused_thp);
}
static ssize_t split_underused_thp_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int err = kstrtobool(buf, &split_underused_thp);
if (err < 0)
return err;
return count;
}
static struct kobj_attribute split_underused_thp_attr = __ATTR(
shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);
static struct attribute *hugepage_attr[] = { static struct attribute *hugepage_attr[] = {
&enabled_attr.attr, &enabled_attr.attr,
&defrag_attr.attr, &defrag_attr.attr,
@@ -431,6 +453,7 @@ static struct attribute *hugepage_attr[] = {
#ifdef CONFIG_SHMEM #ifdef CONFIG_SHMEM
&shmem_enabled_attr.attr, &shmem_enabled_attr.attr,
#endif #endif
&split_underused_thp_attr.attr,
NULL, NULL,
}; };
@@ -1046,6 +1069,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(vma->vm_mm); mm_inc_nr_ptes(vma->vm_mm);
deferred_split_folio(folio, false);
spin_unlock(vmf->ptl); spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC); count_vm_event(THP_FAULT_ALLOC);
count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
@@ -2953,7 +2977,7 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio); return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
} }
-static void remap_page(struct folio *folio, unsigned long nr)
+static void remap_page(struct folio *folio, unsigned long nr, int flags)
{ {
int i = 0; int i = 0;
@@ -2961,7 +2985,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
if (!folio_test_anon(folio)) if (!folio_test_anon(folio))
return; return;
for (;;) { for (;;) {
-remove_migration_ptes(folio, folio, true);
+remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
i += folio_nr_pages(folio); i += folio_nr_pages(folio);
if (i >= nr) if (i >= nr)
break; break;
@@ -3314,7 +3338,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
if (nr_dropped) if (nr_dropped)
shmem_uncharge(head->mapping->host, nr_dropped); shmem_uncharge(head->mapping->host, nr_dropped);
-remap_page(folio, nr);
+remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
for (i = 0; i < nr; i++) { for (i = 0; i < nr; i++) {
struct page *subpage = folio_dst_page(folio, i); struct page *subpage = folio_dst_page(folio, i);
@@ -3376,8 +3400,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct folio *folio = page_folio(page); struct folio *folio = page_folio(page);
struct deferred_split *ds_queue = get_deferred_split_queue(folio); struct deferred_split *ds_queue = get_deferred_split_queue(folio);
XA_STATE(xas, &folio->mapping->i_pages, folio->index); XA_STATE(xas, &folio->mapping->i_pages, folio->index);
-struct anon_vma *anon_vma = NULL;
+bool is_anon = folio_test_anon(folio);
 struct address_space *mapping = NULL;
+struct anon_vma *anon_vma = NULL;
int extra_pins, ret; int extra_pins, ret;
pgoff_t end; pgoff_t end;
bool is_hzp; bool is_hzp;
@@ -3394,7 +3419,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (folio_test_writeback(folio)) if (folio_test_writeback(folio))
return -EBUSY; return -EBUSY;
-if (folio_test_anon(folio)) {
+if (is_anon) {
/* /*
* The caller does not necessarily hold an mmap_lock that would * The caller does not necessarily hold an mmap_lock that would
* prevent the anon_vma disappearing so we first we take a * prevent the anon_vma disappearing so we first we take a
@@ -3495,6 +3520,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (folio_order(folio) > 1 && if (folio_order(folio) > 1 &&
!list_empty(&folio->_deferred_list)) { !list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--; ds_queue->split_queue_len--;
if (folio_test_partially_mapped(folio))
__folio_clear_partially_mapped(folio);
/*
* Reinitialize page_deferred_list after removing the
* page from the split_queue, otherwise a subsequent
* split will see list corruption when checking the
* page_deferred_list.
*/
list_del_init(&folio->_deferred_list); list_del_init(&folio->_deferred_list);
} }
spin_unlock(&ds_queue->split_queue_lock); spin_unlock(&ds_queue->split_queue_lock);
@@ -3522,7 +3555,7 @@ unfreeze:
folio_ref_unfreeze(folio, 1 + extra_pins); folio_ref_unfreeze(folio, 1 + extra_pins);
remap: remap:
free_dst_pages(folio); free_dst_pages(folio);
-remap_page(folio, folio_nr_pages(folio));
+remap_page(folio, folio_nr_pages(folio), 0);
} }
out_unlock: out_unlock:
@@ -3572,6 +3605,8 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
spin_lock_irqsave(&ds_queue->split_queue_lock, flags); spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (!list_empty(&folio->_deferred_list)) { if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--; ds_queue->split_queue_len--;
if (folio_test_partially_mapped(folio))
__folio_clear_partially_mapped(folio);
list_del_init(&folio->_deferred_list); list_del_init(&folio->_deferred_list);
unqueued = true; unqueued = true;
} }
@@ -3580,7 +3615,8 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
return unqueued; /* useful for debug warnings */ return unqueued; /* useful for debug warnings */
} }
-void deferred_split_folio(struct folio *folio)
+/* partially_mapped=false won't clear PG_partially_mapped folio flag */
+void deferred_split_folio(struct folio *folio, bool partially_mapped)
{ {
struct deferred_split *ds_queue = get_deferred_split_queue(folio); struct deferred_split *ds_queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG #ifdef CONFIG_MEMCG
@@ -3595,6 +3631,9 @@ void deferred_split_folio(struct folio *folio)
if (folio_order(folio) <= 1) if (folio_order(folio) <= 1)
return; return;
if (!partially_mapped && !split_underused_thp)
return;
/* /*
* Exclude swapcache: originally to avoid a corrupt deferred split * Exclude swapcache: originally to avoid a corrupt deferred split
* queue. Nowadays that is fully prevented by mem_cgroup_swapout(); * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
@@ -3605,13 +3644,20 @@ void deferred_split_folio(struct folio *folio)
if (folio_test_swapcache(folio)) if (folio_test_swapcache(folio))
return; return;
if (!list_empty(&folio->_deferred_list))
return;
spin_lock_irqsave(&ds_queue->split_queue_lock, flags); spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
-if (list_empty(&folio->_deferred_list)) {
+if (partially_mapped) {
+if (!folio_test_partially_mapped(folio)) {
+__folio_set_partially_mapped(folio);
 if (folio_test_pmd_mappable(folio))
 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+}
+} else {
+/* partially mapped folios cannot become non-partially mapped */
+VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
+}
+if (list_empty(&folio->_deferred_list)) {
list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
ds_queue->split_queue_len++; ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG #ifdef CONFIG_MEMCG
@@ -3640,6 +3686,39 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
return READ_ONCE(ds_queue->split_queue_len); return READ_ONCE(ds_queue->split_queue_len);
} }
static bool thp_underused(struct folio *folio)
{
int num_zero_pages = 0, num_filled_pages = 0;
void *kaddr;
int i;
if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
return false;
for (i = 0; i < folio_nr_pages(folio); i++) {
kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
num_zero_pages++;
if (num_zero_pages > khugepaged_max_ptes_none) {
kunmap_local(kaddr);
return true;
}
} else {
/*
* Another path for early exit once the number
* of non-zero filled pages exceeds threshold.
*/
num_filled_pages++;
if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
kunmap_local(kaddr);
return false;
}
}
kunmap_local(kaddr);
}
return false;
}
static unsigned long deferred_split_scan(struct shrinker *shrink, static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc) struct shrink_control *sc)
{ {
@@ -3647,8 +3726,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
struct deferred_split *ds_queue = &pgdata->deferred_split_queue; struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
unsigned long flags; unsigned long flags;
LIST_HEAD(list); LIST_HEAD(list);
-struct folio *folio, *next;
-int split = 0;
+struct folio *folio, *next, *prev = NULL;
+int split = 0, removed = 0;
#ifdef CONFIG_MEMCG #ifdef CONFIG_MEMCG
if (sc->memcg) if (sc->memcg)
@@ -3663,6 +3742,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
list_move(&folio->_deferred_list, &list); list_move(&folio->_deferred_list, &list);
} else { } else {
/* We lost race with folio_put() */ /* We lost race with folio_put() */
if (folio_test_partially_mapped(folio))
__folio_clear_partially_mapped(folio);
list_del_init(&folio->_deferred_list); list_del_init(&folio->_deferred_list);
ds_queue->split_queue_len--; ds_queue->split_queue_len--;
} }
@@ -3672,20 +3753,55 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
list_for_each_entry_safe(folio, next, &list, _deferred_list) { list_for_each_entry_safe(folio, next, &list, _deferred_list) {
bool did_split = false;
bool underused = false;
if (!folio_test_partially_mapped(folio)) {
underused = thp_underused(folio);
if (!underused)
goto next;
}
if (!folio_trylock(folio)) if (!folio_trylock(folio))
goto next; goto next;
-/* split_huge_page() removes page from list on success */
-if (!split_folio(folio))
+if (!split_folio(folio)) {
+did_split = true;
 split++;
+}
folio_unlock(folio); folio_unlock(folio);
next: next:
/*
* split_folio() removes folio from list on success.
* Only add back to the queue if folio is partially mapped.
* If thp_underused returns false, or if split_folio fails
* in the case it was underused, then consider it used and
* don't add it back to split_queue.
*/
if (did_split) {
; /* folio already removed from list */
} else if (!folio_test_partially_mapped(folio)) {
list_del_init(&folio->_deferred_list);
removed++;
} else {
/*
* That unlocked list_del_init() above would be unsafe,
* unless its folio is separated from any earlier folios
* left on the list (which may be concurrently unqueued)
* by one safe folio with refcount still raised.
*/
swap(folio, prev);
}
if (folio)
folio_put(folio); folio_put(folio);
} }
spin_lock_irqsave(&ds_queue->split_queue_lock, flags); spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
list_splice_tail(&list, &ds_queue->split_queue); list_splice_tail(&list, &ds_queue->split_queue);
ds_queue->split_queue_len -= removed;
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
if (prev)
folio_put(prev);
/* /*
* Stop shrinker if we didn't split any page, but the queue is empty. * Stop shrinker if we didn't split any page, but the queue is empty.
* This can happen if pages were freed under us. * This can happen if pages were freed under us.
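thp_underused() above decides whether a huge folio is worth splitting by scanning each subpage with memchr_inv() and bailing out early once either the zero-filled or the data-filled counter crosses its threshold. The following user-space sketch shows the same early-exit counting over fixed-size chunks; the chunk count and threshold are made-up values, not the kernel's khugepaged_max_ptes_none.

/* Early-exit zero-fill counting in the style of thp_underused(); illustrative values. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CHUNK_SIZE 4096
#define NR_CHUNKS  512          /* stand-in for the number of subpages */
#define MAX_ZERO   64           /* stand-in for khugepaged_max_ptes_none */

/* True when the chunk contains only zero bytes (the kernel uses memchr_inv()). */
static bool chunk_is_zero(const unsigned char *chunk)
{
	static const unsigned char zero[CHUNK_SIZE];

	return memcmp(chunk, zero, CHUNK_SIZE) == 0;
}

static bool region_underused(const unsigned char *region)
{
	int num_zero = 0, num_filled = 0;

	for (int i = 0; i < NR_CHUNKS; i++) {
		if (chunk_is_zero(region + (size_t)i * CHUNK_SIZE)) {
			if (++num_zero > MAX_ZERO)
				return true;            /* mostly zero: worth splitting */
		} else if (++num_filled >= NR_CHUNKS - MAX_ZERO) {
			return false;                   /* enough real data: keep it */
		}
	}
	return false;
}

int main(void)
{
	unsigned char *region = calloc(NR_CHUNKS, CHUNK_SIZE);

	if (!region)
		return 1;
	region[0] = 0xff;                               /* dirty a single chunk */
	printf("underused: %d\n", region_underused(region));   /* prints 1 */
	free(region);
	return 0;
}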


@@ -470,7 +470,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
#define K(x) ((x) << (PAGE_SHIFT-10)) #define K(x) ((x) << (PAGE_SHIFT-10))
extern char * const zone_names[MAX_NR_ZONES]; extern char * const zone_names[MAX_NR_ZONES];
-extern unsigned long free_highatomics[MAX_NR_ZONES];
+extern unsigned long nr_free_highatomic[MAX_NR_ZONES];
/* perform sanity checks on struct pages being allocated or freed */ /* perform sanity checks on struct pages being allocated or freed */
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
@@ -721,8 +721,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags); gfp_t gfp_flags);
extern int user_min_free_kbytes; extern int user_min_free_kbytes;
-extern void free_unref_page(struct page *page, unsigned int order);
-extern void free_unref_page_list(struct list_head *list);
+void free_unref_page(struct page *page, unsigned int order);
+void free_unref_folios(struct folio_batch *fbatch);
+void free_unref_page_list(struct list_head *list);
extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_disable(struct zone *zone);


@@ -84,7 +84,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* *
* Note that these are only respected if collapse was initiated by khugepaged. * Note that these are only respected if collapse was initiated by khugepaged.
*/ */
-static unsigned int khugepaged_max_ptes_none __read_mostly;
+unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly;
@@ -1218,6 +1218,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
pgtable_trans_huge_deposit(mm, pmd, pgtable); pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd); set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd); update_mmu_cache_pmd(vma, address, pmd);
deferred_split_folio(folio, false);
spin_unlock(pmd_ptl); spin_unlock(pmd_ptl);
hpage = NULL; hpage = NULL;


@@ -33,6 +33,7 @@
#include <linux/shmem_fs.h> #include <linux/shmem_fs.h>
#include <linux/hugetlb.h> #include <linux/hugetlb.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/vm_event_item.h> #include <linux/vm_event_item.h>
#include <linux/smp.h> #include <linux/smp.h>
#include <linux/page-flags.h> #include <linux/page-flags.h>
@@ -95,6 +96,9 @@ static bool cgroup_memory_nokmem __ro_after_init;
/* BPF memory accounting disabled? */ /* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init; static bool cgroup_memory_nobpf __ro_after_init;
static struct kmem_cache *memcg_cachep;
static struct kmem_cache *memcg_pn_cachep;
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif #endif
@@ -5384,7 +5388,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{ {
struct mem_cgroup_per_node *pn; struct mem_cgroup_per_node *pn;
-pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
+pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO,
+node);
if (!pn) if (!pn)
return 1; return 1;
@@ -5440,7 +5445,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
int __maybe_unused i; int __maybe_unused i;
long error = -ENOMEM; long error = -ENOMEM;
-memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
+memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL);
if (!memcg) if (!memcg)
return ERR_PTR(error); return ERR_PTR(error);
@@ -6017,8 +6022,6 @@ int mem_cgroup_move_account(struct folio *folio,
css_get(&to->css); css_get(&to->css);
css_put(&from->css); css_put(&from->css);
/* Warning should never happen, so don't worry about refcount non-0 */
WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = (unsigned long)to; folio->memcg_data = (unsigned long)to;
__folio_memcg_unlock(from); __folio_memcg_unlock(from);
@@ -6389,9 +6392,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
enum mc_target_type target_type; enum mc_target_type target_type;
union mc_target target; union mc_target target;
struct folio *folio; struct folio *folio;
bool tried_split_before = false;
retry_pmd:
ptl = pmd_trans_huge_lock(pmd, vma); ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) { if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) { if (mc.precharge < HPAGE_PMD_NR) {
@@ -6401,27 +6402,6 @@ retry_pmd:
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) { if (target_type == MC_TARGET_PAGE) {
folio = target.folio; folio = target.folio;
/*
* Deferred split queue locking depends on memcg,
* and unqueue is unsafe unless folio refcount is 0:
* split or skip if on the queue? first try to split.
*/
if (!list_empty(&folio->_deferred_list)) {
spin_unlock(ptl);
if (!tried_split_before)
split_folio(folio);
folio_unlock(folio);
folio_put(folio);
if (tried_split_before)
return 0;
tried_split_before = true;
goto retry_pmd;
}
/*
* So long as that pmd lock is held, the folio cannot
* be racily added to the _deferred_list, because
* __folio_remove_rmap() will find !partially_mapped.
*/
if (folio_isolate_lru(folio)) { if (folio_isolate_lru(folio)) {
if (!mem_cgroup_move_account(folio, true, if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) { mc.from, mc.to)) {
@@ -7418,6 +7398,18 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
uncharge_batch(&ug); uncharge_batch(&ug);
} }
void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
struct uncharge_gather ug;
unsigned int i;
uncharge_gather_clear(&ug);
for (i = 0; i < folios->nr; i++)
uncharge_folio(folios->folios[i], &ug);
if (ug.memcg)
uncharge_batch(&ug);
}
/** /**
* mem_cgroup_replace_folio - Charge a folio's replacement. * mem_cgroup_replace_folio - Charge a folio's replacement.
* @old: Currently circulating folio. * @old: Currently circulating folio.
@@ -7606,15 +7598,16 @@ static int __init cgroup_memory(char *s)
__setup("cgroup.memory=", cgroup_memory); __setup("cgroup.memory=", cgroup_memory);
/* /*
-* subsys_initcall() for memory controller.
+* Memory controller init before cgroup_init() initialize root_mem_cgroup.
* *
* Some parts like memcg_hotplug_cpu_dead() have to be initialized from this * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
* context because of lock dependencies (cgroup_lock -> cpu hotplug) but * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
* basically everything that doesn't depend on a specific mem_cgroup structure * basically everything that doesn't depend on a specific mem_cgroup structure
* should be initialized from here. * should be initialized from here.
*/ */
-static int __init mem_cgroup_init(void)
+int __init mem_cgroup_init(void)
{ {
unsigned int memcg_size;
int cpu, node; int cpu, node;
/* /*
@@ -7632,6 +7625,13 @@ static int __init mem_cgroup_init(void)
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
drain_local_stock); drain_local_stock);
memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node,
SLAB_PANIC | SLAB_HWCACHE_ALIGN);
for_each_node(node) { for_each_node(node) {
struct mem_cgroup_tree_per_node *rtpn; struct mem_cgroup_tree_per_node *rtpn;
@@ -7645,7 +7645,6 @@ static int __init mem_cgroup_init(void)
return 0; return 0;
} }
subsys_initcall(mem_cgroup_init);
#ifdef CONFIG_SWAP #ifdef CONFIG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)


@@ -19,6 +19,8 @@
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/writeback.h> #include <linux/writeback.h>
#include "slab.h" #include "slab.h"
#undef CREATE_TRACE_POINTS
#include <trace/hooks/mm.h>
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
static void poison_error(mempool_t *pool, void *element, size_t size, static void poison_error(mempool_t *pool, void *element, size_t size,
@@ -383,6 +385,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
unsigned long flags; unsigned long flags;
wait_queue_entry_t wait; wait_queue_entry_t wait;
gfp_t gfp_temp; gfp_t gfp_temp;
bool skip_wait = false;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
might_alloc(gfp_mask); might_alloc(gfp_mask);
@@ -428,6 +431,11 @@ repeat_alloc:
spin_unlock_irqrestore(&pool->lock, flags); spin_unlock_irqrestore(&pool->lock, flags);
return NULL; return NULL;
} }
trace_android_vh_mempool_alloc_skip_wait(&gfp_temp, &skip_wait);
if (skip_wait) {
spin_unlock_irqrestore(&pool->lock, flags);
goto repeat_alloc;
}
/* Let's wait for someone else to return an element to @pool */ /* Let's wait for someone else to return an element to @pool */
init_wait(&wait); init_wait(&wait);


@@ -182,13 +182,57 @@ void putback_movable_pages(struct list_head *l)
} }
EXPORT_SYMBOL_GPL(putback_movable_pages); EXPORT_SYMBOL_GPL(putback_movable_pages);
static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
struct folio *folio,
unsigned long idx)
{
struct page *page = folio_page(folio, idx);
bool contains_data;
pte_t newpte;
void *addr;
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
mm_forbids_zeropage(pvmw->vma->vm_mm))
return false;
/*
* The pmd entry mapping the old thp was flushed and the pte mapping
* this subpage has been non present. If the subpage is only zero-filled
* then map it to the shared zeropage.
*/
addr = kmap_local_page(page);
contains_data = memchr_inv(addr, 0, PAGE_SIZE);
kunmap_local(addr);
if (contains_data)
return false;
newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
pvmw->vma->vm_page_prot));
set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
return true;
}
struct rmap_walk_arg {
struct folio *folio;
bool map_unused_to_zeropage;
};
/* /*
* Restore a potential migration pte to a working pte entry * Restore a potential migration pte to a working pte entry
*/ */
static bool remove_migration_pte(struct folio *dst, static bool remove_migration_pte(struct folio *dst,
struct vm_area_struct *vma, unsigned long addr, void *arg) struct vm_area_struct *vma, unsigned long addr, void *arg)
{ {
-struct folio *src = arg;
+struct rmap_walk_arg *rmap_walk_arg = arg;
+struct folio *src = rmap_walk_arg->folio;
DEFINE_FOLIO_VMA_WALK(pvmw, src, vma, addr, PVMW_SYNC | PVMW_MIGRATION); DEFINE_FOLIO_VMA_WALK(pvmw, src, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
while (page_vma_mapped_walk(&pvmw)) { while (page_vma_mapped_walk(&pvmw)) {
@@ -228,6 +272,9 @@ static bool remove_migration_pte(struct folio *dst,
continue; continue;
} }
#endif #endif
if (rmap_walk_arg->map_unused_to_zeropage &&
try_to_map_unused_to_zeropage(&pvmw, folio, idx))
continue;
folio_get(folio); folio_get(folio);
pte = mk_pte(page, READ_ONCE(vma->vm_page_prot)); pte = mk_pte(page, READ_ONCE(vma->vm_page_prot));
@@ -303,14 +350,21 @@ static bool remove_migration_pte(struct folio *dst,
* Get rid of all migration entries and replace them by * Get rid of all migration entries and replace them by
* references to the indicated page. * references to the indicated page.
*/ */
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
 {
-struct rmap_walk_control rwc = {
-.rmap_one = remove_migration_pte,
-.arg = src,
+struct rmap_walk_arg rmap_walk_arg = {
+.folio = src,
+.map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
 };
-if (locked)
+struct rmap_walk_control rwc = {
+.rmap_one = remove_migration_pte,
+.arg = &rmap_walk_arg,
+};
+VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
+if (flags & RMP_LOCKED)
rmap_walk_locked(dst, &rwc); rmap_walk_locked(dst, &rwc);
else else
rmap_walk(dst, &rwc); rmap_walk(dst, &rwc);
@@ -461,6 +515,7 @@ int folio_migrate_mapping(struct address_space *mapping,
} }
/* Take off deferred split queue while frozen and memcg set */ /* Take off deferred split queue while frozen and memcg set */
if (folio_test_large(folio) && folio_test_large_rmappable(folio))
folio_unqueue_deferred_split(folio); folio_unqueue_deferred_split(folio);
/* /*
@@ -933,7 +988,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
* At this point we know that the migration attempt cannot * At this point we know that the migration attempt cannot
* be successful. * be successful.
*/ */
-remove_migration_ptes(folio, folio, false);
+remove_migration_ptes(folio, folio, 0);
rc = mapping->a_ops->writepage(&folio->page, &wbc); rc = mapping->a_ops->writepage(&folio->page, &wbc);
@@ -1096,7 +1151,7 @@ static void migrate_folio_undo_src(struct folio *src,
struct list_head *ret) struct list_head *ret)
{ {
if (page_was_mapped) if (page_was_mapped)
-remove_migration_ptes(src, src, false);
+remove_migration_ptes(src, src, 0);
/* Drop an anon_vma reference if we took one */ /* Drop an anon_vma reference if we took one */
if (anon_vma) if (anon_vma)
put_anon_vma(anon_vma); put_anon_vma(anon_vma);
@@ -1335,7 +1390,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
lru_add_drain(); lru_add_drain();
if (old_page_state & PAGE_WAS_MAPPED) if (old_page_state & PAGE_WAS_MAPPED)
-remove_migration_ptes(src, dst, false);
+remove_migration_ptes(src, dst, 0);
out_unlock_both: out_unlock_both:
folio_unlock(dst); folio_unlock(dst);
@@ -1474,7 +1529,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
if (page_was_mapped) if (page_was_mapped)
remove_migration_ptes(src, remove_migration_ptes(src,
-rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
unlock_put_anon: unlock_put_anon:
folio_unlock(dst); folio_unlock(dst);
@@ -1702,6 +1757,35 @@ static int migrate_pages_batch(struct list_head *from,
cond_resched(); cond_resched();
/*
* The rare folio on the deferred split list should
* be split now. It should not count as a failure:
* but increment nr_failed because, without doing so,
* migrate_pages() may report success with (split but
* unmigrated) pages still on its fromlist; whereas it
* always reports success when its fromlist is empty.
*
* Only check it without removing it from the list.
* Since the folio can be on deferred_split_scan()
* local list and removing it can cause the local list
* corruption. Folio split process below can handle it
* with the help of folio_ref_freeze().
*
* nr_pages > 2 is needed to avoid checking order-1
* page cache folios. They exist, in contrast to
* non-existent order-1 anonymous folios, and do not
* use _deferred_list.
*/
if (nr_pages > 2 &&
!list_empty(&folio->_deferred_list) &&
folio_test_partially_mapped(folio)) {
if (!try_split_folio(folio, split_folios, mode)) {
nr_failed++;
stats->nr_thp_split += is_thp;
continue;
}
}
/* /*
* Large folio migration might be unsupported or * Large folio migration might be unsupported or
* the allocation might be failed so we should retry * the allocation might be failed so we should retry


@@ -422,7 +422,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
continue; continue;
folio = page_folio(page); folio = page_folio(page);
-remove_migration_ptes(folio, folio, false);
+remove_migration_ptes(folio, folio, 0);
src_pfns[i] = 0; src_pfns[i] = 0;
folio_unlock(folio); folio_unlock(folio);
@@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
src = page_folio(page); src = page_folio(page);
dst = page_folio(newpage); dst = page_folio(newpage);
-remove_migration_ptes(src, dst, false);
+remove_migration_ptes(src, dst, 0);
folio_unlock(src); folio_unlock(src);
if (is_zone_device_page(page)) if (is_zone_device_page(page))


@@ -208,8 +208,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch)
if (lruvec) if (lruvec)
unlock_page_lruvec_irq(lruvec); unlock_page_lruvec_irq(lruvec);
-folios_put(fbatch->folios, folio_batch_count(fbatch));
-folio_batch_reinit(fbatch);
+folios_put(fbatch);
} }
void mlock_drain_local(void) void mlock_drain_local(void)


@@ -1558,7 +1558,7 @@ static inline void setup_usemap(struct zone *zone) {}
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void) void __init set_pageblock_order(void)
{ {
-unsigned int order = MAX_ORDER;
+unsigned int order = PAGE_BLOCK_ORDER;
/* Check that pageblock_nr_pages has not already been setup */ /* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order) if (pageblock_order)


@@ -33,6 +33,7 @@
#include <linux/sysctl.h> #include <linux/sysctl.h>
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/cpuset.h> #include <linux/cpuset.h>
#include <linux/pagevec.h>
#include <linux/memory_hotplug.h> #include <linux/memory_hotplug.h>
#include <linux/nodemask.h> #include <linux/nodemask.h>
#include <linux/vmstat.h> #include <linux/vmstat.h>
@@ -323,7 +324,7 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
#endif #endif
}; };
-unsigned long free_highatomics[MAX_NR_ZONES] = {0};
+unsigned long nr_free_highatomic[MAX_NR_ZONES] = {0};
int min_free_kbytes = 1024; int min_free_kbytes = 1024;
int user_min_free_kbytes = -1; int user_min_free_kbytes = -1;
@@ -770,8 +771,8 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
if (is_migrate_cma(migratetype)) if (is_migrate_cma(migratetype))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
else if (is_migrate_highatomic(migratetype)) else if (is_migrate_highatomic(migratetype))
-WRITE_ONCE(free_highatomics[zone_idx(zone)],
-free_highatomics[zone_idx(zone)] + nr_pages);
+WRITE_ONCE(nr_free_highatomic[zone_idx(zone)],
+nr_free_highatomic[zone_idx(zone)] + nr_pages);
} }
/* Used for pages not on another list */ /* Used for pages not on another list */
@@ -921,7 +922,6 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON_PAGE(page->flags & check_flags, page); VM_BUG_ON_PAGE(page->flags & check_flags, page);
VM_BUG_ON(migratetype == -1); VM_BUG_ON(migratetype == -1);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page); VM_BUG_ON_PAGE(bad_range(zone, page), page);
@@ -1237,6 +1237,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
} }
} }
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
trace_android_vh_mm_free_page(page + i);
} }
} }
if (PageMappingFlags(page)) if (PageMappingFlags(page))
@@ -1252,6 +1253,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
page_cpupid_reset_last(page); page_cpupid_reset_last(page);
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
trace_android_vh_mm_free_page(page);
reset_page_owner(page, order); reset_page_owner(page, order);
free_page_pinner(page, order); free_page_pinner(page, order);
page_table_check_free(page, order); page_table_check_free(page, order);
@@ -1372,7 +1374,6 @@ static void free_one_page(struct zone *zone, struct page *page,
static void __free_pages_ok(struct page *page, unsigned int order, static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags) fpi_t fpi_flags)
{ {
unsigned long flags;
int migratetype; int migratetype;
unsigned long pfn = page_to_pfn(page); unsigned long pfn = page_to_pfn(page);
struct zone *zone = page_zone(page); struct zone *zone = page_zone(page);
@@ -1392,21 +1393,17 @@ skip_prepare:
fpi_flags, &skip_free_pages_ok); fpi_flags, &skip_free_pages_ok);
if (skip_free_pages_ok) if (skip_free_pages_ok)
return; return;
-spin_lock_irqsave(&zone->lock, flags);
+/*
+* Calling get_pfnblock_migratetype() without spin_lock_irqsave() here
+* is used to avoid calling get_pfnblock_migratetype() under the lock.
+* This will reduce the lock holding time.
+*/
 migratetype = get_pfnblock_migratetype(page, pfn);
 trace_android_vh_free_unref_page_bypass(page, order, migratetype, &skip_free_unref_page);
-if (skip_free_unref_page) {
-spin_unlock_irqrestore(&zone->lock, flags);
+if (skip_free_unref_page)
 return;
-}
-if (unlikely(has_isolate_pageblock(zone) ||
-is_migrate_isolate(migratetype))) {
-migratetype = get_pfnblock_migratetype(page, pfn);
-}
-__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
-spin_unlock_irqrestore(&zone->lock, flags);
+free_one_page(zone, page, pfn, order, fpi_flags);
 __count_vm_events(PGFREE, 1 << order);
} }
@@ -2249,8 +2246,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
struct zone *zone; struct zone *zone;
struct page *page; struct page *page;
int order; int order;
-int ret;
 bool skip_unreserve_highatomic = false;
+int ret;
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
ac->nodemask) { ac->nodemask) {
@@ -2787,58 +2784,59 @@ void free_unref_page(struct page *page, unsigned int order)
} }
 /*
-* Free a list of 0-order pages
+* Free a batch of folios
 */
-void free_unref_page_list(struct list_head *list)
+void free_unref_folios(struct folio_batch *folios)
 {
 unsigned long __maybe_unused UP_flags;
-struct page *page, *next;
 struct per_cpu_pages *pcp = NULL;
 struct zone *locked_zone = NULL;
-int batch_count = 0;
+int i, j;
/* Prepare folios for freeing */
for (i = 0, j = 0; i < folios->nr; i++) {
struct folio *folio = folios->folios[i];
unsigned long pfn = folio_pfn(folio);
unsigned int order = folio_order(folio);
if (order > 0 && folio_test_large_rmappable(folio))
folio_unqueue_deferred_split(folio);
if (!free_pages_prepare(&folio->page, order, FPI_NONE))
continue;
/*
* Free orders not handled on the PCP directly to the
* allocator.
*/
if (!pcp_allowed_order(order)) {
free_one_page(folio_zone(folio), &folio->page,
pfn, order, FPI_NONE);
continue;
}
folio->private = (void *)(unsigned long)order;
if (j != i)
folios->folios[j] = folio;
j++;
}
folios->nr = j;
for (i = 0; i < folios->nr; i++) {
struct folio *folio = folios->folios[i];
struct zone *zone = folio_zone(folio);
unsigned long pfn = folio_pfn(folio);
unsigned int order = (unsigned long)folio->private;
 int migratetype;
-bool skip_free = false;
-/* Prepare pages for freeing */
-list_for_each_entry_safe(page, next, list, lru) {
-unsigned long pfn = page_to_pfn(page);
-if (!free_pages_prepare(page, 0, FPI_NONE)) {
-list_del(&page->lru);
-continue;
-}
-/*
-* Free isolated pages directly to the allocator, see
-* comment in free_unref_page.
-*/
-migratetype = get_pfnblock_migratetype(page, pfn);
-if (unlikely(is_migrate_isolate(migratetype))) {
-list_del(&page->lru);
-free_one_page(page_zone(page), page, pfn, 0, FPI_NONE);
-continue;
-}
-}
-trace_android_vh_free_unref_page_list_bypass(list, &skip_free);
-if (skip_free)
-return;
-list_for_each_entry_safe(page, next, list, lru) {
-struct zone *zone = page_zone(page);
-unsigned long pfn = page_to_pfn(page);
-list_del(&page->lru);
-migratetype = get_pfnblock_migratetype(page, pfn);
-/*
-* Either different zone requiring a different pcp lock or
-* excessive lock hold times when freeing a large list of
-* pages.
-*/
-if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
+folio->private = NULL;
+migratetype = get_pfnblock_migratetype(&folio->page, pfn);
+/* Different zone requires a different pcp lock */
+if (zone != locked_zone ||
+is_migrate_isolate(migratetype)) {
 if (pcp) {
 pcp_spin_unlock(pcp);
 pcp_trylock_finish(UP_flags);
+locked_zone = NULL;
+pcp = NULL;
 }
 /*
@@ -2846,24 +2844,21 @@ void free_unref_page_list(struct list_head *list)
 * allocator, see comment in free_unref_page.
 */
 if (is_migrate_isolate(migratetype)) {
-free_one_page(zone, page, page_to_pfn(page),
-0, FPI_NONE);
+free_one_page(zone, &folio->page, pfn,
+order, FPI_NONE);
 continue;
 }
-batch_count = 0;
 /*
-* trylock is necessary as pages may be getting freed
+* trylock is necessary as folios may be getting freed
 * from IRQ or SoftIRQ context after an IO completion.
 */
 pcp_trylock_prepare(UP_flags);
 pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 if (unlikely(!pcp)) {
 pcp_trylock_finish(UP_flags);
-free_one_page(zone, page, pfn,
-0, FPI_NONE);
-locked_zone = NULL;
+free_one_page(zone, &folio->page, pfn,
+order, FPI_NONE);
 continue;
 }
 locked_zone = zone;
@@ -2880,15 +2875,39 @@ void free_unref_page_list(struct list_head *list)
 migratetype = MIGRATE_MOVABLE;
 }
-trace_mm_page_free_batched(page);
-free_unref_page_commit(zone, pcp, page, migratetype, 0);
-batch_count++;
+trace_mm_page_free_batched(&folio->page);
+free_unref_page_commit(zone, pcp, &folio->page, migratetype,
+order);
 }
if (pcp) { if (pcp) {
pcp_spin_unlock(pcp); pcp_spin_unlock(pcp);
pcp_trylock_finish(UP_flags); pcp_trylock_finish(UP_flags);
} }
folio_batch_reinit(folios);
}
void free_unref_page_list(struct list_head *list)
{
struct folio_batch fbatch;
bool skip_free = false;
trace_android_vh_free_unref_page_list_bypass(list, &skip_free);
if (skip_free)
return;
folio_batch_init(&fbatch);
while (!list_empty(list)) {
struct folio *folio = list_first_entry(list, struct folio, lru);
list_del(&folio->lru);
if (folio_batch_add(&fbatch, folio) > 0)
continue;
free_unref_folios(&fbatch);
}
if (fbatch.nr)
free_unref_folios(&fbatch);
} }
/* /*
@@ -3216,7 +3235,7 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
* watermark then subtract the free pages reserved for highatomic. * watermark then subtract the free pages reserved for highatomic.
*/ */
if (likely(!(alloc_flags & ALLOC_RESERVES))) if (likely(!(alloc_flags & ALLOC_RESERVES)))
-unusable_free += READ_ONCE(free_highatomics[zone_idx(z)]);
+unusable_free += READ_ONCE(nr_free_highatomic[zone_idx(z)]);
#ifdef CONFIG_CMA #ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */ /* If allocation can't use CMA areas don't use free CMA pages */
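free_unref_page_list() above now drains its list through a fixed-size folio_batch, flushing with free_unref_folios() whenever folio_batch_add() reports that no space is left, and once more for the partial tail. The same gather-and-flush pattern in a self-contained user-space sketch, with a hypothetical batch type standing in for struct folio_batch:

/* Gather-and-flush batching in the style of free_unref_page_list(); names are illustrative. */
#include <stdio.h>

#define BATCH_MAX 31                  /* a folio_batch also holds a small fixed number */

struct item { int id; };

struct batch {
	unsigned int nr;
	struct item *items[BATCH_MAX];
};

static void batch_init(struct batch *b)
{
	b->nr = 0;
}

/* Returns the space remaining after the add; 0 means the batch just became full. */
static unsigned int batch_add(struct batch *b, struct item *it)
{
	b->items[b->nr++] = it;
	return BATCH_MAX - b->nr;
}

static void batch_flush(struct batch *b)
{
	printf("flushing %u items\n", b->nr);
	/* ... a real implementation frees every item under one lock acquisition ... */
	b->nr = 0;
}

int main(void)
{
	struct item items[100];
	struct batch b;

	batch_init(&b);
	for (int i = 0; i < 100; i++) {
		items[i].id = i;
		if (batch_add(&b, &items[i]) > 0)
			continue;             /* still room: keep gathering */
		batch_flush(&b);              /* full: flush in one go */
	}
	if (b.nr)
		batch_flush(&b);              /* flush the partial tail */
	return 0;
}

The payoff of the pattern is the same as in the diff: the expensive per-flush work (zone and pcp locking in the kernel) is paid once per batch rather than once per page.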


@@ -417,9 +417,9 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
ret = __alloc_contig_migrate_range(&cc, head_pfn, ret = __alloc_contig_migrate_range(&cc, head_pfn,
head_pfn + nr_pages, page_mt); head_pfn + nr_pages, page_mt);
if (ret) if (ret)
goto failed; goto failed;
pfn = head_pfn + nr_pages; pfn = head_pfn + nr_pages;
continue; continue;
} }


@@ -270,6 +270,9 @@ static const struct vm_operations_struct pad_vma_ops = {
.name = pad_vma_name, .name = pad_vma_name,
}; };
/* Defined in kernel/fork.c */
extern struct kmem_cache *vm_area_cachep;
/* /*
* Returns a new VMA representing the padding in @vma; * Returns a new VMA representing the padding in @vma;
* returns NULL if no padding in @vma or allocation failed. * returns NULL if no padding in @vma or allocation failed.
@@ -281,7 +284,7 @@ static struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma)
if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK)) if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
return NULL; return NULL;
-pad = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
+pad = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!pad) { if (!pad) {
pr_warn("Page size migration: Failed to allocate padding VMA"); pr_warn("Page size migration: Failed to allocate padding VMA");
return NULL; return NULL;
@@ -347,7 +350,7 @@ void show_map_pad_vma(struct vm_area_struct *vma, struct seq_file *m,
else else
((show_pad_maps_fn)func)(m, pad); ((show_pad_maps_fn)func)(m, pad);
-kfree(pad);
+kmem_cache_free(vm_area_cachep, pad);
} }
/* /*


@@ -1599,8 +1599,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
* Check partially_mapped first to ensure it is a large folio. * Check partially_mapped first to ensure it is a large folio.
*/ */
if (folio_test_anon(folio) && partially_mapped && if (folio_test_anon(folio) && partially_mapped &&
-list_empty(&folio->_deferred_list))
-deferred_split_folio(folio);
+!folio_test_partially_mapped(folio))
+deferred_split_folio(folio, true);
} }
/* /*


@@ -342,7 +342,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(low_wmark_pages(zone)), K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)), K(high_wmark_pages(zone)),
K(zone->nr_reserved_highatomic), K(zone->nr_reserved_highatomic),
-K(free_highatomics[zone_idx(zone)]),
+K(nr_free_highatomic[zone_idx(zone)]),
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),

mm/swap.c

@@ -77,26 +77,33 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
.lock = INIT_LOCAL_LOCK(lock), .lock = INIT_LOCAL_LOCK(lock),
}; };
+static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
+unsigned long *flagsp)
+{
+if (folio_test_lru(folio)) {
+folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
+lruvec_del_folio(*lruvecp, folio);
+__folio_clear_lru_flags(folio);
+}
+}
 /*
 * This path almost never happens for VM activity - pages are normally freed
 * in batches. But it gets used by networking - and for compound pages.
 */
-static void __page_cache_release(struct folio *folio)
+static void page_cache_release(struct folio *folio)
 {
-if (folio_test_lru(folio)) {
-struct lruvec *lruvec;
+struct lruvec *lruvec = NULL;
 unsigned long flags;
-lruvec = folio_lruvec_lock_irqsave(folio, &flags);
-lruvec_del_folio(lruvec, folio);
-__folio_clear_lru_flags(folio);
+__page_cache_release(folio, &lruvec, &flags);
+if (lruvec)
 unlock_page_lruvec_irqrestore(lruvec, flags);
-}
 }
 static void __folio_put_small(struct folio *folio)
 {
-__page_cache_release(folio);
+page_cache_release(folio);
 mem_cgroup_uncharge(folio);
 free_unref_page(&folio->page, 0);
 }
@@ -110,7 +117,7 @@ static void __folio_put_large(struct folio *folio)
* be called for hugetlb (it has a separate hugetlb_cgroup.) * be called for hugetlb (it has a separate hugetlb_cgroup.)
*/ */
if (!folio_test_hugetlb(folio)) if (!folio_test_hugetlb(folio))
-__page_cache_release(folio);
+page_cache_release(folio);
destroy_large_folio(folio); destroy_large_folio(folio);
} }
@@ -133,22 +140,25 @@ EXPORT_SYMBOL(__folio_put);
*/ */
void put_pages_list(struct list_head *pages) void put_pages_list(struct list_head *pages)
{ {
struct folio_batch fbatch;
struct folio *folio, *next; struct folio *folio, *next;
folio_batch_init(&fbatch);
list_for_each_entry_safe(folio, next, pages, lru) { list_for_each_entry_safe(folio, next, pages, lru) {
if (!folio_put_testzero(folio)) { if (!folio_put_testzero(folio))
list_del(&folio->lru);
continue; continue;
}
if (folio_test_large(folio)) { if (folio_test_large(folio)) {
list_del(&folio->lru);
__folio_put_large(folio); __folio_put_large(folio);
continue; continue;
} }
/* LRU flag must be clear because it's passed using the lru */ /* LRU flag must be clear because it's passed using the lru */
if (folio_batch_add(&fbatch, folio) > 0)
continue;
free_unref_folios(&fbatch);
} }
free_unref_page_list(pages); if (fbatch.nr)
free_unref_folios(&fbatch);
INIT_LIST_HEAD(pages); INIT_LIST_HEAD(pages);
} }
EXPORT_SYMBOL(put_pages_list); EXPORT_SYMBOL(put_pages_list);
@@ -170,7 +180,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
* while the LRU lock is held. * while the LRU lock is held.
* *
* (That is not true of __page_cache_release(), and not necessarily * (That is not true of __page_cache_release(), and not necessarily
* true of release_pages(): but those only clear the mlocked flag after * true of folios_put(): but those only clear the mlocked flag after
* folio_put_testzero() has excluded any other users of the folio.) * folio_put_testzero() has excluded any other users of the folio.)
*/ */
if (folio_evictable(folio)) { if (folio_evictable(folio)) {
@@ -208,7 +218,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
continue; continue;
lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio); move_fn(lruvec, folio);
folio_set_lru(folio); folio_set_lru(folio);
@@ -216,8 +226,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
if (lruvec) if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags); unlock_page_lruvec_irqrestore(lruvec, flags);
folios_put(fbatch->folios, folio_batch_count(fbatch)); folios_put(fbatch);
folio_batch_reinit(fbatch);
} }
static void folio_batch_add_and_move(struct folio_batch *fbatch, static void folio_batch_add_and_move(struct folio_batch *fbatch,
@@ -958,47 +967,29 @@ void lru_cache_disable(void)
EXPORT_SYMBOL_GPL(lru_cache_disable); EXPORT_SYMBOL_GPL(lru_cache_disable);
/** /**
* release_pages - batched put_page() * folios_put_refs - Reduce the reference count on a batch of folios.
* @arg: array of pages to release * @folios: The folios.
* @nr: number of pages * @refs: The number of refs to subtract from each folio.
* *
* Decrement the reference count on all the pages in @arg. If it * Like folio_put(), but for a batch of folios. This is more efficient
* fell to zero, remove the page from the LRU and free it. * than writing the loop yourself as it will optimise the locks which need
* to be taken if the folios are freed. The folios batch is returned
* empty and ready to be reused for another batch; there is no need
* to reinitialise it. If @refs is NULL, we subtract one from each
* folio refcount.
* *
* Note that the argument can be an array of pages, encoded pages, * Context: May be called in process or interrupt context, but not in NMI
* or folio pointers. We ignore any encoded bits, and turn any of * context. May be called while holding a spinlock.
* them into just a folio that gets free'd.
*/ */
void release_pages(release_pages_arg arg, int nr) void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
{ {
int i; int i, j;
struct encoded_page **encoded = arg.encoded_pages;
LIST_HEAD(pages_to_free);
struct lruvec *lruvec = NULL; struct lruvec *lruvec = NULL;
unsigned long flags = 0; unsigned long flags = 0;
unsigned int lock_batch;
for (i = 0; i < nr; i++) { for (i = 0, j = 0; i < folios->nr; i++) {
unsigned int nr_refs = 1; struct folio *folio = folios->folios[i];
struct folio *folio; unsigned int nr_refs = refs ? refs[i] : 1;
/* Turn any of the argument types into a folio */
folio = page_folio(encoded_page_ptr(encoded[i]));
/* Is our next entry actually "nr_pages" -> "nr_refs" ? */
if (unlikely(encoded_page_flags(encoded[i]) &
ENCODED_PAGE_BIT_NR_PAGES_NEXT))
nr_refs = encoded_nr_pages(encoded[++i]);
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
* same lruvec. The lock is held only if lruvec != NULL.
*/
if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
if (is_huge_zero_page(&folio->page)) if (is_huge_zero_page(&folio->page))
continue; continue;
@@ -1018,34 +1009,73 @@ void release_pages(release_pages_arg arg, int nr)
if (!folio_ref_sub_and_test(folio, nr_refs)) if (!folio_ref_sub_and_test(folio, nr_refs))
continue; continue;
if (folio_test_large(folio)) { /* hugetlb has its own memcg */
if (folio_test_hugetlb(folio)) {
if (lruvec) { if (lruvec) {
unlock_page_lruvec_irqrestore(lruvec, flags); unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL; lruvec = NULL;
} }
__folio_put_large(folio); free_huge_folio(folio);
continue; continue;
} }
if (folio_test_lru(folio)) { folio_unqueue_deferred_split(folio);
struct lruvec *prev_lruvec = lruvec; __page_cache_release(folio, &lruvec, &flags);
lruvec = folio_lruvec_relock_irqsave(folio, lruvec, if (j != i)
&flags); folios->folios[j] = folio;
if (prev_lruvec != lruvec) j++;
lock_batch = 0;
lruvec_del_folio(lruvec, folio);
__folio_clear_lru_flags(folio);
}
list_add(&folio->lru, &pages_to_free);
} }
if (lruvec) if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags); unlock_page_lruvec_irqrestore(lruvec, flags);
if (!j) {
folio_batch_reinit(folios);
return;
}
mem_cgroup_uncharge_list(&pages_to_free); folios->nr = j;
free_unref_page_list(&pages_to_free); mem_cgroup_uncharge_folios(folios);
free_unref_folios(folios);
}
EXPORT_SYMBOL(folios_put_refs);
/**
* release_pages - batched put_page()
* @arg: array of pages to release
* @nr: number of pages
*
* Decrement the reference count on all the pages in @arg. If it
* fell to zero, remove the page from the LRU and free it.
*
* Note that the argument can be an array of pages, encoded pages,
* or folio pointers. We ignore any encoded bits, and turn any of
* them into just a folio that gets free'd.
*/
void release_pages(release_pages_arg arg, int nr)
{
struct folio_batch fbatch;
int refs[PAGEVEC_SIZE];
struct encoded_page **encoded = arg.encoded_pages;
int i;
folio_batch_init(&fbatch);
for (i = 0; i < nr; i++) {
/* Turn any of the argument types into a folio */
struct folio *folio = page_folio(encoded_page_ptr(encoded[i]));
/* Is our next entry actually "nr_pages" -> "nr_refs" ? */
refs[fbatch.nr] = 1;
if (unlikely(encoded_page_flags(encoded[i]) &
ENCODED_PAGE_BIT_NR_PAGES_NEXT))
refs[fbatch.nr] = encoded_nr_pages(encoded[++i]);
if (folio_batch_add(&fbatch, folio) > 0)
continue;
folios_put_refs(&fbatch, refs);
}
if (fbatch.nr)
folios_put_refs(&fbatch, refs);
} }
EXPORT_SYMBOL(release_pages); EXPORT_SYMBOL(release_pages);
@@ -1065,8 +1095,7 @@ void __folio_batch_release(struct folio_batch *fbatch)
lru_add_drain(); lru_add_drain();
fbatch->percpu_pvec_drained = true; fbatch->percpu_pvec_drained = true;
} }
release_pages(fbatch->folios, folio_batch_count(fbatch)); folios_put(fbatch);
folio_batch_reinit(fbatch);
} }
EXPORT_SYMBOL(__folio_batch_release); EXPORT_SYMBOL(__folio_batch_release);
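
The folios_put_refs() kernel-doc above spells out the batching contract: fill a folio_batch, hand it over, and get the batch back reinitialised. As a rough sketch of a caller (not part of this merge; the helper name and the kernel-context assumptions are made up for illustration), draining an arbitrary folio array could look like this:

/* Hypothetical kernel-side caller of the batched-release API shown above. */
#include <linux/mm.h>
#include <linux/pagevec.h>

static void example_drop_folio_refs(struct folio **folios, unsigned int nr)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	for (i = 0; i < nr; i++) {
		/* folio_batch_add() returns the space left; 0 means the batch is full. */
		if (folio_batch_add(&fbatch, folios[i]) > 0)
			continue;
		/* refs == NULL drops exactly one reference per folio. */
		folios_put_refs(&fbatch, NULL);
	}
	if (folio_batch_count(&fbatch))
		folios_put_refs(&fbatch, NULL);
}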


@@ -1358,6 +1358,7 @@ const char * const vmstat_text[] = {
"thp_split_page",
"thp_split_page_failed",
"thp_deferred_split_page",
+"thp_underused_split_page",
"thp_split_pmd",
"thp_shatter_page",
"thp_shatter_page_failed",


@@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_INET) += ipv4/
obj-$(CONFIG_TLS) += tls/
obj-$(CONFIG_XFRM) += xfrm/
-obj-$(CONFIG_UNIX_SCM) += unix/
+obj-$(CONFIG_UNIX) += unix/
obj-y += ipv6/
obj-$(CONFIG_BPFILTER) += bpfilter/
obj-$(CONFIG_PACKET) += packet/


@@ -36,6 +36,7 @@
#include <net/compat.h>
#include <net/scm.h>
#include <net/cls_cgroup.h>
+#include <net/af_unix.h>
/*
@@ -85,8 +86,15 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
return -ENOMEM;
*fplp = fpl;
fpl->count = 0;
+fpl->count_unix = 0;
fpl->max = SCM_MAX_FD;
fpl->user = NULL;
+#if IS_ENABLED(CONFIG_UNIX)
+fpl->inflight = false;
+fpl->dead = false;
+fpl->edges = NULL;
+INIT_LIST_HEAD(&fpl->vertices);
+#endif
}
fpp = &fpl->fp[fpl->count];
@@ -109,6 +117,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
fput(file);
return -EINVAL;
}
+if (unix_get_socket(file))
+fpl->count_unix++;
*fpp++ = file;
fpl->count++;
}
@@ -366,13 +377,18 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
if (!fpl)
return NULL;
-new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
+new_fpl = kmemdup(fpl, sizeof(*fpl),
GFP_KERNEL_ACCOUNT);
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
new_fpl->max = new_fpl->count;
new_fpl->user = get_uid(fpl->user);
+#if IS_ENABLED(CONFIG_UNIX)
+new_fpl->inflight = false;
+new_fpl->edges = NULL;
+INIT_LIST_HEAD(&new_fpl->vertices);
+#endif
}
return new_fpl;
}
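
scm_fp_copy() above is what unpacks an SCM_RIGHTS control message into a scm_fp_list, and it now also counts how many of the passed descriptors are AF_UNIX sockets (count_unix). For reference, this is the userspace side of such a message; the example is illustrative and not part of the merge:

/* Round-trip of one file descriptor over an AF_UNIX socketpair, i.e. the
 * userspace traffic that scm_fp_copy() parses on the send side.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	int sv[2], received = -1;
	char dummy = 'x';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0) {
		perror("socketpair");
		return 1;
	}

	/* Send: attach STDOUT_FILENO as SCM_RIGHTS ancillary data. */
	memset(&u, 0, sizeof(u));
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &(int){ STDOUT_FILENO }, sizeof(int));
	if (sendmsg(sv[0], &msg, 0) < 0) {
		perror("sendmsg");
		return 1;
	}

	/* Receive: the kernel installs a fresh descriptor for the same file. */
	memset(&u, 0, sizeof(u));
	msg.msg_controllen = sizeof(u.buf);
	if (recvmsg(sv[1], &msg, 0) < 0) {
		perror("recvmsg");
		return 1;
	}
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
		if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
			memcpy(&received, CMSG_DATA(cmsg), sizeof(int));

	dprintf(received, "got duplicated stdout as fd %d\n", received);
	close(sv[0]);
	close(sv[1]);
	return 0;
}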


@@ -16,11 +16,6 @@ config UNIX
Say Y unless you know what you are doing.
-config UNIX_SCM
-bool
-depends on UNIX
-default y
config AF_UNIX_OOB
bool
depends on UNIX


@@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o
obj-$(CONFIG_UNIX_DIAG) += unix_diag.o
unix_diag-y := diag.o
-obj-$(CONFIG_UNIX_SCM) += scm.o


@@ -117,8 +117,6 @@
#include <linux/file.h>
#include <linux/btf_ids.h>
-#include "scm.h"
static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
@@ -980,11 +978,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
sk->sk_destruct = unix_sock_destructor;
u = unix_sk(sk);
-u->inflight = 0;
+u->listener = NULL;
+u->vertex = NULL;
u->path.dentry = NULL;
u->path.mnt = NULL;
spin_lock_init(&u->lock);
-INIT_LIST_HEAD(&u->link);
mutex_init(&u->iolock); /* single task reading lock */
mutex_init(&u->bindlock); /* single task binding lock */
init_waitqueue_head(&u->peer_wait);
@@ -1583,6 +1581,7 @@ restart:
newsk->sk_type = sk->sk_type;
init_peercred(newsk);
newu = unix_sk(newsk);
+newu->listener = other;
RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
otheru = unix_sk(other);
@@ -1678,8 +1677,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
bool kern)
{
struct sock *sk = sock->sk;
-struct sock *tsk;
struct sk_buff *skb;
+struct sock *tsk;
int err;
err = -EOPNOTSUPP;
@@ -1709,6 +1708,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
/* attach accepted sock to socket */
unix_state_lock(tsk);
+unix_update_edges(unix_sk(tsk));
newsock->state = SS_CONNECTED;
unix_sock_inherit_flags(sock, newsock);
sock_graft(tsk, newsock);
@@ -1752,51 +1752,65 @@ out:
return err;
}
+/* The "user->unix_inflight" variable is protected by the garbage
+* collection lock, and we just read it locklessly here. If you go
+* over the limit, there might be a tiny race in actually noticing
+* it across threads. Tough.
+*/
+static inline bool too_many_unix_fds(struct task_struct *p)
+{
+struct user_struct *user = current_user();
+if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
+return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+return false;
+}
+static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+if (too_many_unix_fds(current))
+return -ETOOMANYREFS;
+/* Need to duplicate file references for the sake of garbage
+* collection. Otherwise a socket in the fps might become a
+* candidate for GC while the skb is not yet queued.
+*/
+UNIXCB(skb).fp = scm_fp_dup(scm->fp);
+if (!UNIXCB(skb).fp)
+return -ENOMEM;
+if (unix_prepare_fpl(UNIXCB(skb).fp))
+return -ENOMEM;
+return 0;
+}
+static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+{
+scm->fp = UNIXCB(skb).fp;
+UNIXCB(skb).fp = NULL;
+unix_destroy_fpl(scm->fp);
+}
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+}
-/*
-* Garbage collection of unix sockets starts by selecting a set of
-* candidate sockets which have reference only from being in flight
-* (total_refs == inflight_refs). This condition is checked once during
-* the candidate collection phase, and candidates are marked as such, so
-* that non-candidates can later be ignored. While inflight_refs is
-* protected by unix_gc_lock, total_refs (file count) is not, hence this
-* is an instantaneous decision.
-*
-* Once a candidate, however, the socket must not be reinstalled into a
-* file descriptor while the garbage collection is in progress.
-*
-* If the above conditions are met, then the directed graph of
-* candidates (*) does not change while unix_gc_lock is held.
-*
-* Any operations that changes the file count through file descriptors
-* (dup, close, sendmsg) does not change the graph since candidates are
-* not installed in fds.
-*
-* Dequeing a candidate via recvmsg would install it into an fd, but
-* that takes unix_gc_lock to decrement the inflight count, so it's
-* serialized with garbage collection.
-*
-* MSG_PEEK is special in that it does not change the inflight count,
-* yet does install the socket into an fd. The following lock/unlock
-* pair is to ensure serialization with garbage collection. It must be
-* done between incrementing the file count and installing the file into
-* an fd.
-*
-* If garbage collection starts after the barrier provided by the
-* lock/unlock, then it will see the elevated refcount and not mark this
-* as a candidate. If a garbage collection is already in progress
-* before the file count was incremented, then the lock/unlock pair will
-* ensure that garbage collection is finished before progressing to
-* installing the fd.
-*
-* (*) A -> B where B is on the queue of A or B is on the queue of C
-* which is on the queue of listening socket A.
-*/
-spin_lock(&unix_gc_lock);
-spin_unlock(&unix_gc_lock);
-}
+static void unix_destruct_scm(struct sk_buff *skb)
+{
+struct scm_cookie scm;
+memset(&scm, 0, sizeof(scm));
+scm.pid = UNIXCB(skb).pid;
+if (UNIXCB(skb).fp)
+unix_detach_fds(&scm, skb);
+/* Alas, it calls VFS */
+/* So fscking what? fput() had been SMP-safe since the last Summer */
+scm_destroy(&scm);
+sock_wfree(skb);
+}
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
@@ -1855,8 +1869,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
-if (unlikely(fp && fp->count))
+if (unlikely(fp && fp->count)) {
atomic_add(fp->count, &u->scm_stat.nr_fds);
+unix_add_edges(fp, u);
+}
}
static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
@@ -1864,8 +1880,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
-if (unlikely(fp && fp->count))
+if (unlikely(fp && fp->count)) {
atomic_sub(fp->count, &u->scm_stat.nr_fds);
+unix_del_edges(fp);
+}
}
/*
@@ -1885,11 +1903,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
long timeo;
int err;
-wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;
+wait_for_unix_gc(scm.fp);
err = -EOPNOTSUPP;
if (msg->msg_flags&MSG_OOB)
goto out;
@@ -2157,11 +2176,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
bool fds_sent = false;
int data_len;
-wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;
+wait_for_unix_gc(scm.fp);
err = -EOPNOTSUPP;
if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)


@@ -81,249 +81,519 @@
#include <net/scm.h>
#include <net/tcp_states.h>
-#include "scm.h"
-/* Internal data structures and random procedures: */
-static LIST_HEAD(gc_candidates);
-static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
-struct sk_buff_head *hitlist)
-{
-struct sk_buff *skb;
-struct sk_buff *next;
-spin_lock(&x->sk_receive_queue.lock);
-skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
-/* Do we have file descriptors ? */
-if (UNIXCB(skb).fp) {
-bool hit = false;
-/* Process the descriptors of this socket */
-int nfd = UNIXCB(skb).fp->count;
-struct file **fp = UNIXCB(skb).fp->fp;
-while (nfd--) {
-/* Get the socket the fd matches if it indeed does so */
-struct unix_sock *u = unix_get_socket(*fp++);
-/* Ignore non-candidates, they could have been added
-* to the queues after starting the garbage collection
-*/
-if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
-hit = true;
-func(u);
-}
-}
-if (hit && hitlist != NULL) {
-__skb_unlink(skb, &x->sk_receive_queue);
-__skb_queue_tail(hitlist, skb);
-}
-}
-}
-spin_unlock(&x->sk_receive_queue.lock);
-}
-static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
-struct sk_buff_head *hitlist)
-{
-if (x->sk_state != TCP_LISTEN) {
-scan_inflight(x, func, hitlist);
struct unix_sock *unix_get_socket(struct file *filp)
{
struct inode *inode = file_inode(filp);
/* Socket ? */
if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
struct socket *sock = SOCKET_I(inode);
const struct proto_ops *ops;
struct sock *sk = sock->sk;
ops = READ_ONCE(sock->ops);
/* PF_UNIX ? */
if (sk && ops && ops->family == PF_UNIX)
return unix_sk(sk);
}
return NULL;
}
static struct unix_vertex *unix_edge_successor(struct unix_edge *edge)
{
/* If an embryo socket has a fd,
* the listener indirectly holds the fd's refcnt.
*/
if (edge->successor->listener)
return unix_sk(edge->successor->listener)->vertex;
return edge->successor->vertex;
}
static bool unix_graph_maybe_cyclic;
static bool unix_graph_grouped;
static void unix_update_graph(struct unix_vertex *vertex)
{
/* If the receiver socket is not inflight, no cyclic
* reference could be formed.
*/
if (!vertex)
return;
unix_graph_maybe_cyclic = true;
unix_graph_grouped = false;
}
static LIST_HEAD(unix_unvisited_vertices);
enum unix_vertex_index {
UNIX_VERTEX_INDEX_MARK1,
UNIX_VERTEX_INDEX_MARK2,
UNIX_VERTEX_INDEX_START,
};
static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;
static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
{
struct unix_vertex *vertex = edge->predecessor->vertex;
if (!vertex) {
vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
vertex->index = unix_vertex_unvisited_index;
vertex->out_degree = 0;
INIT_LIST_HEAD(&vertex->edges);
INIT_LIST_HEAD(&vertex->scc_entry);
list_move_tail(&vertex->entry, &unix_unvisited_vertices);
edge->predecessor->vertex = vertex;
}
vertex->out_degree++;
list_add_tail(&edge->vertex_entry, &vertex->edges);
unix_update_graph(unix_edge_successor(edge));
}
static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
{
struct unix_vertex *vertex = edge->predecessor->vertex;
if (!fpl->dead)
unix_update_graph(unix_edge_successor(edge));
list_del(&edge->vertex_entry);
vertex->out_degree--;
if (!vertex->out_degree) {
edge->predecessor->vertex = NULL;
list_move_tail(&vertex->entry, &fpl->vertices);
}
}
static void unix_free_vertices(struct scm_fp_list *fpl)
{
struct unix_vertex *vertex, *next_vertex;
list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) {
list_del(&vertex->entry);
kfree(vertex);
}
}
static DEFINE_SPINLOCK(unix_gc_lock);
unsigned int unix_tot_inflight;
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
{
int i = 0, j = 0;
spin_lock(&unix_gc_lock);
if (!fpl->count_unix)
goto out;
do {
struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
struct unix_edge *edge;
if (!inflight)
continue;
edge = fpl->edges + i++;
edge->predecessor = inflight;
edge->successor = receiver;
unix_add_edge(fpl, edge);
} while (i < fpl->count_unix);
receiver->scm_stat.nr_unix_fds += fpl->count_unix;
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
out:
WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);
spin_unlock(&unix_gc_lock);
fpl->inflight = true;
unix_free_vertices(fpl);
}
void unix_del_edges(struct scm_fp_list *fpl)
{
struct unix_sock *receiver;
int i = 0;
spin_lock(&unix_gc_lock);
if (!fpl->count_unix)
goto out;
do {
struct unix_edge *edge = fpl->edges + i++;
unix_del_edge(fpl, edge);
} while (i < fpl->count_unix);
if (!fpl->dead) {
receiver = fpl->edges[0].successor;
receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
}
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
out:
WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
spin_unlock(&unix_gc_lock);
fpl->inflight = false;
}
void unix_update_edges(struct unix_sock *receiver)
{
/* nr_unix_fds is only updated under unix_state_lock().
* If it's 0 here, the embryo socket is not part of the
* inflight graph, and GC will not see it, so no lock needed.
*/
if (!receiver->scm_stat.nr_unix_fds) {
receiver->listener = NULL;
-} else {
-struct sk_buff *skb;
-struct sk_buff *next;
} else {
spin_lock(&unix_gc_lock);
unix_update_graph(unix_sk(receiver->listener)->vertex);
receiver->listener = NULL;
spin_unlock(&unix_gc_lock);
}
}
int unix_prepare_fpl(struct scm_fp_list *fpl)
{
struct unix_vertex *vertex;
int i;
if (!fpl->count_unix)
return 0;
for (i = 0; i < fpl->count_unix; i++) {
vertex = kmalloc(sizeof(*vertex), GFP_KERNEL);
if (!vertex)
goto err;
list_add(&vertex->entry, &fpl->vertices);
}
fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges),
GFP_KERNEL_ACCOUNT);
if (!fpl->edges)
goto err;
return 0;
err:
unix_free_vertices(fpl);
return -ENOMEM;
}
void unix_destroy_fpl(struct scm_fp_list *fpl)
{
if (fpl->inflight)
unix_del_edges(fpl);
kvfree(fpl->edges);
unix_free_vertices(fpl);
}
static bool unix_vertex_dead(struct unix_vertex *vertex)
{
struct unix_edge *edge;
-struct unix_sock *u;
-LIST_HEAD(embryos);
-/* For a listening socket collect the queued embryos
-* and perform a scan on them as well.
-*/
-spin_lock(&x->sk_receive_queue.lock);
-skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
-u = unix_sk(skb->sk);
-/* An embryo cannot be in-flight, so it's safe
-* to use the list link.
-*/
-BUG_ON(!list_empty(&u->link));
-list_add_tail(&u->link, &embryos);
-}
-spin_unlock(&x->sk_receive_queue.lock);
-while (!list_empty(&embryos)) {
-u = list_entry(embryos.next, struct unix_sock, link);
-scan_inflight(&u->sk, func, hitlist);
-list_del_init(&u->link);
-}
-}
-}
struct unix_sock *u;
long total_ref;
list_for_each_entry(edge, &vertex->edges, vertex_entry) {
struct unix_vertex *next_vertex = unix_edge_successor(edge);
/* The vertex's fd can be received by a non-inflight socket. */
if (!next_vertex)
return false;
/* The vertex's fd can be received by an inflight socket in
* another SCC.
*/
if (next_vertex->scc_index != vertex->scc_index)
return false;
}
/* No receiver exists out of the same SCC. */
edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
u = edge->predecessor;
total_ref = file_count(u->sk.sk_socket->file);
/* If not close()d, total_ref > out_degree. */
if (total_ref != vertex->out_degree)
return false;
return true;
}
enum unix_recv_queue_lock_class {
U_RECVQ_LOCK_NORMAL,
U_RECVQ_LOCK_EMBRYO,
};
static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist)
{
skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist);
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
if (u->oob_skb) {
WARN_ON_ONCE(skb_unref(u->oob_skb));
u->oob_skb = NULL;
}
#endif
}
static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
{
struct unix_vertex *vertex;
list_for_each_entry_reverse(vertex, scc, scc_entry) {
struct sk_buff_head *queue;
struct unix_edge *edge;
struct unix_sock *u;
edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
u = edge->predecessor;
queue = &u->sk.sk_receive_queue;
spin_lock(&queue->lock);
if (u->sk.sk_state == TCP_LISTEN) {
struct sk_buff *skb;
skb_queue_walk(queue, skb) {
struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;
/* listener -> embryo order, the inversion never happens. */
spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
unix_collect_queue(unix_sk(skb->sk), hitlist);
spin_unlock(&embryo_queue->lock);
}
} else {
unix_collect_queue(u, hitlist);
}
spin_unlock(&queue->lock);
}
} }
-static void dec_inflight(struct unix_sock *usk)
-{
-usk->inflight--;
-}
-static void inc_inflight(struct unix_sock *usk)
-{
-usk->inflight++;
-}
-static void inc_inflight_move_tail(struct unix_sock *u)
-{
-u->inflight++;
-/* If this still might be part of a cycle, move it to the end
-* of the list, so that it's checked even if it was already
-* passed over
-*/
-if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
-list_move_tail(&u->link, &gc_candidates);
-}
static bool unix_scc_cyclic(struct list_head *scc)
{
struct unix_vertex *vertex;
struct unix_edge *edge;
/* SCC containing multiple vertices ? */
if (!list_is_singular(scc))
return true;
vertex = list_first_entry(scc, typeof(*vertex), scc_entry);
/* Self-reference or a embryo-listener circle ? */
list_for_each_entry(edge, &vertex->edges, vertex_entry) {
if (unix_edge_successor(edge) == vertex)
return true;
}
return false;
}
static LIST_HEAD(unix_visited_vertices);
static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index,
struct sk_buff_head *hitlist)
{
LIST_HEAD(vertex_stack);
struct unix_edge *edge;
LIST_HEAD(edge_stack);
next_vertex:
/* Push vertex to vertex_stack and mark it as on-stack
* (index >= UNIX_VERTEX_INDEX_START).
* The vertex will be popped when finalising SCC later.
*/
list_add(&vertex->scc_entry, &vertex_stack);
vertex->index = *last_index;
vertex->scc_index = *last_index;
(*last_index)++;
/* Explore neighbour vertices (receivers of the current vertex's fd). */
list_for_each_entry(edge, &vertex->edges, vertex_entry) {
struct unix_vertex *next_vertex = unix_edge_successor(edge);
if (!next_vertex)
continue;
if (next_vertex->index == unix_vertex_unvisited_index) {
/* Iterative deepening depth first search
*
* 1. Push a forward edge to edge_stack and set
* the successor to vertex for the next iteration.
*/
list_add(&edge->stack_entry, &edge_stack);
vertex = next_vertex;
goto next_vertex;
/* 2. Pop the edge directed to the current vertex
* and restore the ancestor for backtracking.
*/
prev_vertex:
edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
list_del_init(&edge->stack_entry);
next_vertex = vertex;
vertex = edge->predecessor->vertex;
/* If the successor has a smaller scc_index, two vertices
* are in the same SCC, so propagate the smaller scc_index
* to skip SCC finalisation.
*/
vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
} else if (next_vertex->index != unix_vertex_grouped_index) {
/* Loop detected by a back/cross edge.
*
* The successor is on vertex_stack, so two vertices are in
* the same SCC. If the successor has a smaller *scc_index*,
* propagate it to skip SCC finalisation.
*/
vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
} else {
/* The successor was already grouped as another SCC */
}
}
if (vertex->index == vertex->scc_index) {
struct unix_vertex *v;
struct list_head scc;
bool scc_dead = true;
/* SCC finalised.
*
* If the scc_index was not updated, all the vertices above on
* vertex_stack are in the same SCC. Group them using scc_entry.
*/
__list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
list_for_each_entry_reverse(v, &scc, scc_entry) {
/* Don't restart DFS from this vertex in unix_walk_scc(). */
list_move_tail(&v->entry, &unix_visited_vertices);
/* Mark vertex as off-stack. */
v->index = unix_vertex_grouped_index;
if (scc_dead)
scc_dead = unix_vertex_dead(v);
}
if (scc_dead)
unix_collect_skb(&scc, hitlist);
else if (!unix_graph_maybe_cyclic)
unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
list_del(&scc);
}
/* Need backtracking ? */
if (!list_empty(&edge_stack))
goto prev_vertex;
}
static void unix_walk_scc(struct sk_buff_head *hitlist)
{
unsigned long last_index = UNIX_VERTEX_INDEX_START;
unix_graph_maybe_cyclic = false;
/* Visit every vertex exactly once.
* __unix_walk_scc() moves visited vertices to unix_visited_vertices.
*/
while (!list_empty(&unix_unvisited_vertices)) {
struct unix_vertex *vertex;
vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
__unix_walk_scc(vertex, &last_index, hitlist);
}
list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
unix_graph_grouped = true;
}
static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
{
unix_graph_maybe_cyclic = false;
while (!list_empty(&unix_unvisited_vertices)) {
struct unix_vertex *vertex;
struct list_head scc;
bool scc_dead = true;
vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
list_add(&scc, &vertex->scc_entry);
list_for_each_entry_reverse(vertex, &scc, scc_entry) {
list_move_tail(&vertex->entry, &unix_visited_vertices);
if (scc_dead)
scc_dead = unix_vertex_dead(vertex);
}
if (scc_dead)
unix_collect_skb(&scc, hitlist);
else if (!unix_graph_maybe_cyclic)
unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
list_del(&scc);
}
list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
}
static bool gc_in_progress;
static void __unix_gc(struct work_struct *work)
{
-struct sk_buff *next_skb, *skb;
-struct unix_sock *u;
-struct unix_sock *next;
struct sk_buff_head hitlist;
-struct list_head cursor;
-LIST_HEAD(not_cycle_list);
struct sk_buff *skb;
spin_lock(&unix_gc_lock);
-/* First, select candidates for garbage collection. Only
-* in-flight sockets are considered, and from those only ones
-* which don't have any external reference.
-*
-* Holding unix_gc_lock will protect these candidates from
-* being detached, and hence from gaining an external
-* reference. Since there are no possible receivers, all
-* buffers currently on the candidates' queues stay there
-* during the garbage collection.
-*
-* We also know that no new candidate can be added onto the
-* receive queues. Other, non candidate sockets _can_ be
-* added to queue, so we must make sure only to touch
-* candidates.
-*
-* Embryos, though never candidates themselves, affect which
-* candidates are reachable by the garbage collector. Before
-* being added to a listener's queue, an embryo may already
-* receive data carrying SCM_RIGHTS, potentially making the
-* passed socket a candidate that is not yet reachable by the
-* collector. It becomes reachable once the embryo is
-* enqueued. Therefore, we must ensure that no SCM-laden
-* embryo appears in a (candidate) listener's queue between
-* consecutive scan_children() calls.
-*/
-list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
-struct sock *sk = &u->sk;
-long total_refs;
-total_refs = file_count(sk->sk_socket->file);
-BUG_ON(!u->inflight);
-BUG_ON(total_refs < u->inflight);
-if (total_refs == u->inflight) {
-list_move_tail(&u->link, &gc_candidates);
-__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
-__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
-if (sk->sk_state == TCP_LISTEN) {
-unix_state_lock_nested(sk, U_LOCK_GC_LISTENER);
-unix_state_unlock(sk);
-}
-}
-}
-/* Now remove all internal in-flight reference to children of
-* the candidates.
-*/
-list_for_each_entry(u, &gc_candidates, link)
-scan_children(&u->sk, dec_inflight, NULL);
-/* Restore the references for children of all candidates,
-* which have remaining references. Do this recursively, so
-* only those remain, which form cyclic references.
-*
-* Use a "cursor" link, to make the list traversal safe, even
-* though elements might be moved about.
-*/
-list_add(&cursor, &gc_candidates);
-while (cursor.next != &gc_candidates) {
-u = list_entry(cursor.next, struct unix_sock, link);
-/* Move cursor to after the current position. */
-list_move(&cursor, &u->link);
-if (u->inflight) {
-list_move_tail(&u->link, &not_cycle_list);
-__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
-scan_children(&u->sk, inc_inflight_move_tail, NULL);
-}
-}
-list_del(&cursor);
-/* Now gc_candidates contains only garbage. Restore original
-* inflight counters for these as well, and remove the skbuffs
-* which are creating the cycle(s).
-*/
-skb_queue_head_init(&hitlist);
-list_for_each_entry(u, &gc_candidates, link) {
-scan_children(&u->sk, inc_inflight, &hitlist);
-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
-if (u->oob_skb) {
-kfree_skb(u->oob_skb);
-u->oob_skb = NULL;
-}
-#endif
-}
-/* not_cycle_list contains those sockets which do not make up a
-* cycle. Restore these to the inflight list.
-*/
-while (!list_empty(&not_cycle_list)) {
-u = list_entry(not_cycle_list.next, struct unix_sock, link);
-__clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
-list_move_tail(&u->link, &gc_inflight_list);
-}
if (!unix_graph_maybe_cyclic) {
spin_unlock(&unix_gc_lock);
goto skip_gc;
}
__skb_queue_head_init(&hitlist);
if (unix_graph_grouped)
unix_walk_scc_fast(&hitlist);
else
unix_walk_scc(&hitlist);
spin_unlock(&unix_gc_lock);
-/* We need io_uring to clean its registered files, ignore all io_uring
-* originated skbs. It's fine as io_uring doesn't keep references to
-* other io_uring instances and so killing all other files in the cycle
-* will put all io_uring references forcing it to go through normal
-* release.path eventually putting registered files.
-*/
-skb_queue_walk_safe(&hitlist, skb, next_skb) {
-if (skb->destructor == io_uring_destruct_scm) {
-__skb_unlink(skb, &hitlist);
-skb_queue_tail(&skb->sk->sk_receive_queue, skb);
-}
-}
skb_queue_walk(&hitlist, skb) {
if (UNIXCB(skb).fp)
UNIXCB(skb).fp->dead = true;
}
-/* Here we are. Hitlist is filled. Die. */
__skb_queue_purge(&hitlist);
-spin_lock(&unix_gc_lock);
-/* There could be io_uring registered files, just push them back to
-* the inflight list
-*/
-list_for_each_entry_safe(u, next, &gc_candidates, link)
-list_move_tail(&u->link, &gc_inflight_list);
-/* All candidates should have been detached by now. */
-BUG_ON(!list_empty(&gc_candidates));
-/* Paired with READ_ONCE() in wait_for_unix_gc(). */
skip_gc:
WRITE_ONCE(gc_in_progress, false);
-spin_unlock(&unix_gc_lock);
}
static DECLARE_WORK(unix_gc_work, __unix_gc);
@@ -335,8 +605,9 @@ void unix_gc(void)
}
#define UNIX_INFLIGHT_TRIGGER_GC 16000
#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)
-void wait_for_unix_gc(void)
void wait_for_unix_gc(struct scm_fp_list *fpl)
{
/* If number of inflight sockets is insane,
* force a garbage collect right now.
@@ -348,6 +619,13 @@ void wait_for_unix_gc(void)
!READ_ONCE(gc_in_progress))
unix_gc();
/* Penalise users who want to send AF_UNIX sockets
* but whose sockets have not been received yet.
*/
if (!fpl || !fpl->count_unix ||
READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
return;
if (READ_ONCE(gc_in_progress))
flush_work(&unix_gc_work);
}
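
The new GC only has work to do when AF_UNIX sockets keep each other alive purely through in-flight SCM_RIGHTS references, i.e. when the SCC walk above finds a dead strongly connected component. As an illustration (not part of the merge), the following userspace program manufactures the simplest such cycle: it sends one end of a socketpair into that end's own receive queue and then closes every descriptor, leaving only the in-flight reference for unix_gc() to reclaim.

/* Builds the smallest reference cycle the AF_UNIX GC has to clean up. */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

static int send_fd(int via, int fd)
{
	char dummy = 'x';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;

	memset(&u, 0, sizeof(u));
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return sendmsg(via, &msg, 0) < 0 ? -1 : 0;
}

int main(void)
{
	int sv[2];

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0) {
		perror("socketpair");
		return 1;
	}

	/* The skb queued on sv[1] now carries a reference to sv[1]'s own file. */
	if (send_fd(sv[0], sv[1])) {
		perror("sendmsg");
		return 1;
	}

	/* Drop the userspace references; the in-flight one keeps sv[1] alive,
	 * which keeps its receive queue (and thus the skb) alive: a dead SCC.
	 */
	close(sv[0]);
	close(sv[1]);
	return 0;
}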


@@ -1,156 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/init.h>
#include <linux/io_uring.h>
#include "scm.h"
unsigned int unix_tot_inflight;
EXPORT_SYMBOL(unix_tot_inflight);
LIST_HEAD(gc_inflight_list);
EXPORT_SYMBOL(gc_inflight_list);
DEFINE_SPINLOCK(unix_gc_lock);
EXPORT_SYMBOL(unix_gc_lock);
struct unix_sock *unix_get_socket(struct file *filp)
{
struct inode *inode = file_inode(filp);
/* Socket ? */
if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
struct socket *sock = SOCKET_I(inode);
const struct proto_ops *ops = READ_ONCE(sock->ops);
struct sock *s = sock->sk;
/* PF_UNIX ? */
if (s && ops && ops->family == PF_UNIX)
return unix_sk(s);
}
return NULL;
}
EXPORT_SYMBOL(unix_get_socket);
/* Keep the number of times in flight count for the file
* descriptor if it is for an AF_UNIX socket.
*/
void unix_inflight(struct user_struct *user, struct file *fp)
{
struct unix_sock *u = unix_get_socket(fp);
spin_lock(&unix_gc_lock);
if (u) {
if (!u->inflight) {
BUG_ON(!list_empty(&u->link));
list_add_tail(&u->link, &gc_inflight_list);
} else {
BUG_ON(list_empty(&u->link));
}
u->inflight++;
/* Paired with READ_ONCE() in wait_for_unix_gc() */
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
}
WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
spin_unlock(&unix_gc_lock);
}
void unix_notinflight(struct user_struct *user, struct file *fp)
{
struct unix_sock *u = unix_get_socket(fp);
spin_lock(&unix_gc_lock);
if (u) {
BUG_ON(!u->inflight);
BUG_ON(list_empty(&u->link));
u->inflight--;
if (!u->inflight)
list_del_init(&u->link);
/* Paired with READ_ONCE() in wait_for_unix_gc() */
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
}
WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
spin_unlock(&unix_gc_lock);
}
/*
* The "user->unix_inflight" variable is protected by the garbage
* collection lock, and we just read it locklessly here. If you go
* over the limit, there might be a tiny race in actually noticing
* it across threads. Tough.
*/
static inline bool too_many_unix_fds(struct task_struct *p)
{
struct user_struct *user = current_user();
if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
return false;
}
int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
int i;
if (too_many_unix_fds(current))
return -ETOOMANYREFS;
/*
* Need to duplicate file references for the sake of garbage
* collection. Otherwise a socket in the fps might become a
* candidate for GC while the skb is not yet queued.
*/
UNIXCB(skb).fp = scm_fp_dup(scm->fp);
if (!UNIXCB(skb).fp)
return -ENOMEM;
for (i = scm->fp->count - 1; i >= 0; i--)
unix_inflight(scm->fp->user, scm->fp->fp[i]);
return 0;
}
EXPORT_SYMBOL(unix_attach_fds);
void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
int i;
scm->fp = UNIXCB(skb).fp;
UNIXCB(skb).fp = NULL;
for (i = scm->fp->count-1; i >= 0; i--)
unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}
EXPORT_SYMBOL(unix_detach_fds);
void unix_destruct_scm(struct sk_buff *skb)
{
struct scm_cookie scm;
memset(&scm, 0, sizeof(scm));
scm.pid = UNIXCB(skb).pid;
if (UNIXCB(skb).fp)
unix_detach_fds(&scm, skb);
/* Alas, it calls VFS */
/* So fscking what? fput() had been SMP-safe since the last Summer */
scm_destroy(&scm);
sock_wfree(skb);
}
EXPORT_SYMBOL(unix_destruct_scm);
void io_uring_destruct_scm(struct sk_buff *skb)
{
unix_destruct_scm(skb);
}
EXPORT_SYMBOL(io_uring_destruct_scm);


@@ -1,10 +0,0 @@
#ifndef NET_UNIX_SCM_H
#define NET_UNIX_SCM_H
extern struct list_head gc_inflight_list;
extern spinlock_t unix_gc_lock;
int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb);
void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb);
#endif


@@ -105,7 +105,8 @@ static int __init sample_trace_array_init(void)
* NOTE: This function increments the reference counter
* associated with the trace array - "tr".
*/
-tr = trace_array_get_by_name("sample-instance");
+tr = trace_array_get_by_name_ext("sample-instance",
+"sched,timer,kprobes");
if (!tr)
return -1;


@@ -88,6 +88,76 @@ static void write_debugfs(const char *fmt, ...)
}
}
static char *allocate_zero_filled_hugepage(size_t len)
{
char *result;
size_t i;
result = memalign(pmd_pagesize, len);
if (!result) {
printf("Fail to allocate memory\n");
exit(EXIT_FAILURE);
}
madvise(result, len, MADV_HUGEPAGE);
for (i = 0; i < len; i++)
result[i] = (char)0;
return result;
}
static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len)
{
unsigned long rss_anon_before, rss_anon_after;
size_t i;
if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
printf("No THP is allocated\n");
exit(EXIT_FAILURE);
}
rss_anon_before = rss_anon();
if (!rss_anon_before) {
printf("No RssAnon is allocated before split\n");
exit(EXIT_FAILURE);
}
/* split all THPs */
write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
(uint64_t)one_page + len, 0);
for (i = 0; i < len; i++)
if (one_page[i] != (char)0) {
printf("%ld byte corrupted\n", i);
exit(EXIT_FAILURE);
}
if (!check_huge_anon(one_page, 0, pmd_pagesize)) {
printf("Still AnonHugePages not split\n");
exit(EXIT_FAILURE);
}
rss_anon_after = rss_anon();
if (rss_anon_after >= rss_anon_before) {
printf("Incorrect RssAnon value. Before: %ld After: %ld\n",
rss_anon_before, rss_anon_after);
exit(EXIT_FAILURE);
}
}
void split_pmd_zero_pages(void)
{
char *one_page;
int nr_hpages = 4;
size_t len = nr_hpages * pmd_pagesize;
one_page = allocate_zero_filled_hugepage(len);
verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len);
printf("Split zero filled huge pages successful\n");
free(one_page);
}
void split_pmd_thp(void)
{
char *one_page;
@@ -305,6 +375,7 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
split_pmd_zero_pages();
split_pmd_thp();
split_pte_mapped_thp();
split_file_backed_thp();


@@ -11,6 +11,7 @@
#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
#define SMAP_FILE_PATH "/proc/self/smaps"
#define STATUS_FILE_PATH "/proc/self/status"
#define MAX_LINE_LENGTH 500
unsigned int __page_size;
@@ -97,6 +98,27 @@ uint64_t read_pmd_pagesize(void)
return strtoul(buf, NULL, 10);
}
unsigned long rss_anon(void)
{
unsigned long rss_anon = 0;
FILE *fp;
char buffer[MAX_LINE_LENGTH];
fp = fopen(STATUS_FILE_PATH, "r");
if (!fp)
ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH);
if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer)))
goto err_out;
if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1)
ksft_exit_fail_msg("Reading status error\n");
err_out:
fclose(fp);
return rss_anon;
}
bool __check_huge(void *addr, char *pattern, int nr_hpages,
uint64_t hpage_size)
{


@@ -39,6 +39,7 @@ unsigned long pagemap_get_pfn(int fd, char *start);
void clear_softdirty(void);
bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
uint64_t read_pmd_pagesize(void);
unsigned long rss_anon(void);
bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);