Merge android15-6.6 into android15-6.6-lts
This merges the android15-6.6 branch into the -lts branch, catching it up with the latest changes in there. It contains the following commits: *3a0107a38e
ANDROID: KVM: arm64: Ensure SVE initialization precedes PSCI for protected VCPUs *3b75103301
ANDROID: 16K: Use vma_area slab cache for pad VMA *a213abada8
UPSTREAM: af_unix: Fix uninit-value in __unix_walk_scc() *5156d49ed9
UPSTREAM: af_unix: Fix garbage collection of embryos carrying OOB with SCM_RIGHTS *fbd783363d
ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'dead' *ddd6979a15
UPSTREAM: af_unix: Add dead flag to struct scm_fp_list. *95a397ac6b
UPSTREAM: af_unix: Don't access successor in unix_del_edges() during GC. *a130d07d24
UPSTREAM: af_unix: Try not to hold unix_gc_lock during accept(). *5ada288086
UPSTREAM: af_unix: Remove lock dance in unix_peek_fds(). *11d208f893
UPSTREAM: af_unix: Replace garbage collection algorithm. *67a3a58da1
UPSTREAM: af_unix: Detect dead SCC. *b9f8dfdb54
UPSTREAM: af_unix: Assign a unique index to SCC. *b22b0a7597
UPSTREAM: af_unix: Avoid Tarjan's algorithm if unnecessary. *1e4d62adeb
UPSTREAM: af_unix: Skip GC if no cycle exists. *250c362acd
UPSTREAM: af_unix: Save O(n) setup of Tarjan's algo. *0c40a05117
UPSTREAM: af_unix: Fix up unix_edge.successor for embryo socket. *f5ea8b439d
UPSTREAM: af_unix: Save listener for embryo socket. *279ed20d5f
UPSTREAM: af_unix: Detect Strongly Connected Components. *16dca90335
UPSTREAM: af_unix: Iterate all vertices by DFS. *80df4d17af
UPSTREAM: af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb. *40549e6976
ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'inflight' *769fc01f23
UPSTREAM: af_unix: Link struct unix_edge when queuing skb. *de6b1e85b9
ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'edges' *844c9666eb
UPSTREAM: af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd. *c93b3ba51e
ANDROID: af_unix: Provide ABI fixes for recently introduced 'struct scm_fp_list' attribute 'vertices' *ffef32ddaf
UPSTREAM: af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd. *f972f2d7b1
ANDROID: af_unix: Allocate memory for the largest possible size of 'struct scm_fp_list' *b077571da9
UPSTREAM: af_unix: Remove CONFIG_UNIX_SCM. *a390e62751
ANDROID: Align x86-64 microdroid cgroup support with aarch64 microdroid *6dbb3c2e90
BACKPORT: mm: remove folio from deferred split list before uncharging it *a8553b4e2a
BACKPORT: mm: use __page_cache_release() in folios_put() *4d61851d14
UPSTREAM: mm: fix list corruption in put_pages_list *f61f355bdc
UPSTREAM: mm: use free_unref_folios() in put_pages_list() *316b2e6e4b
BACKPORT: mm: remove use of folio list from folios_put() *f9c6fb1b82
BACKPORT: memcg: add mem_cgroup_uncharge_folios() *3bc695b2be
Merge tag 'android15-6.6.92_r00' into android15-6.6 *0813441033
FROMGIT: scsi: core: ufs: Fix a hang in the error handler *a74f052176
FROMGIT: genirq/cpuhotplug: Restore affinity even for suspended IRQ *fc6844d9d2
FROMGIT: genirq/cpuhotplug: Rebalance managed interrupts across multi-CPU hotplug *0bc63a98d9
ANDROID: abi_gki_aarch64_vivo: Update symbol list *8fb77f6f9d
ANDROID: mm: Reset unused page flag bits on free *f0bd864fe0
Revert "ANDROID: mm: Set PAGE_BLOCK_ORDER to 7 when ARM64_16K_PAGES" *97f5b70ad3
ANDROID: GKI: Update symbol list for xiaomi *2bc7bc937c
BACKPORT: erofs: lazily initialize per-CPU workers and CPU hotplug hooks *434940a426
FROMGIT: scsi: ufs: mcq: Delete ufshcd_release_scsi_cmd() in ufshcd_mcq_abort() *0ac9aa9b62
ANDROID: GKI: Rename xring's symbol list. *f56b0532df
BACKPORT: mm: set pageblock_order to HPAGE_PMD_ORDER in case with !CONFIG_HUGETLB_PAGE but THP enabled *f19494634f
ANDROID: GKI: Update symbol list for vivo *68191d9c7a
ANDROID: vendor_hooks: add hook to retry mempool allocation without delay *45afa56280
ANDROID: mm: Set PAGE_BLOCK_ORDER to 7 when ARM64_16K_PAGES *3148030c78
ANDROID: KVM: arm64: Fix hyp_alloc(0) *4ec55296c6
ANDROID: fix out-of-bounds error when trace_create_new_event *d9ec0e18f4
ANDROID: CONFIG_CRYPTO_SHA1_ARM64_CE=y to GKI and Microdroid kernel *0272a2ffdc
BACKPORT: FROMGIT: mm: add CONFIG_PAGE_BLOCK_ORDER to select page block order *86ba3f3eb2
BACKPORT: binder: Create safe versions of binder log files *8a55e7a02a
UPSTREAM: binder: Refactor binder_node print synchronization *fe02cfa135
ANDROID: iommu/arm-smmu-v3-kvm: Fix accidental domain ID freeing in free() *9733cd1fa2
ANDROID: GKI: Update xiaomi symbol list. *125f87a148
UPSTREAM: mm/memcg: use kmem_cache when alloc memcg pernode info *78e6a3d422
UPSTREAM: mm/memcg: use kmem_cache when alloc memcg *b6bde4b648
UPSTREAM: mm/memcg: move mem_cgroup_init() ahead of cgroup_init() *476cb9bc9b
UPSTREAM: af_unix: Remove io_uring code for GC. *fb219cbb0b
UPSTREAM: af_unix: Replace BUG_ON() with WARN_ON_ONCE(). *3c39219343
ANDROID: Enable memory controller for microdroid *c6325b075d
ANDROID: cgroup: Fix cgroup_root backport padding calculation *452d899d2f
ANDROID: GKI: Fix up abi issue in struct scm_fp_list *cec9cb02ce
UPSTREAM: af_unix: Try to run GC async. *93c2d24134
BACKPORT: FROMGIT: usb: typec: tcpm: move tcpm_queue_vdm_unlocked to asynchronous work *ee016b98b7
BACKPORT: usb: typec: tcpm: enforce ready state when queueing alt mode vdm *4be94a6b03
ANDROID: ABI: Update pixel symbol list *6af2e78f07
ANDROID: fix ABI breakage for trace_array extensions *6f62c0d0fb
UPSTREAM: tracing: Allow creating instances with specified system events *f8d73c6178
UPSTREAM: af_unix: Run GC on only one CPU. *a70bd568b1
UPSTREAM: af_unix: Return struct unix_sock from unix_get_socket(). *c1b974e51d
UPSTREAM: iommu: Handle race with default domain setup *315fdde476
ANDROID: ABI: Update pixel symbol list *32288ce2f2
ANDROID: vendor_hooks: Add hooks for xhci reset *dd8fcb5398
ANDROID: GKI: deferred split queue corruption - ABI fixup *374babecde
UPSTREAM: mm/thp: fix deferred split queue not partially_mapped: fix *3a8faa5b25
BACKPORT: mm/thp: fix deferred split unqueue naming and locking *84cc354617
UPSTREAM: mm/thp: fix deferred split queue not partially_mapped *dd46964f3e
BACKPORT: mm: add sysfs entry to disable splitting underused THPs *40ffd525e5
UPSTREAM: mm: split underused THPs *a63eadb11d
BACKPORT: mm: introduce a pageflag for partially mapped folios *f1b73b0513
UPSTREAM: mm/migrate: fix kernel BUG at mm/compaction.c:2761! *cbbd153073
BACKPORT: mm/migrate: split source folio if it is on deferred split list *c6f085c328
BACKPORT: mm: count the number of partially mapped anonymous THPs per size *545db6094c
BACKPORT: mm: count the number of anonymous THPs per size *6ee860d0d4
UPSTREAM: mm: separate out FOLIO_FLAGS from PAGEFLAGS *f052bbc24d
UPSTREAM: mm: selftest to verify zero-filled pages are mapped to zeropage *d826c84482
BACKPORT: mm: remap unused subpages to shared zeropage when splitting isolated thp *bc9f1a0f43
Revert "BACKPORT: mm/thp: fix deferred split unqueue naming and locking" *c06fa3b5cd
ANDROID: GKI: page_alloc ABI fixup *819bdc71dc
BACKPORT: mm: page_alloc: batch vmstat updates in expand() *c97dfdfac0
UPSTREAM: mm/page_alloc: keep track of free highatomic *cdff4faf2b
UPSTREAM: mm: remove unused has_isolate_pageblock *5b5902fcf6
UPSTREAM: mm: page_alloc: fix highatomic typing in multi-block buddies *48e8763a95
BACKPORT: mm: page_alloc: consolidate free page accounting *a4f7bd4b3d
BACKPORT: mm: page_isolation: prepare for hygienic freelists *a8dcfbc68b
UPSTREAM: mm: page_alloc: set migratetype inside move_freepages() *209c219a0f
BACKPORT: mm: page_alloc: close migratetype race between freeing and stealing *1a3654f59a
BACKPORT: mm: page_alloc: fix freelist movement during block conversion *861e9d3c44
UPSTREAM: mm: page_alloc: fix move_freepages_block() range error *350c3b1d61
UPSTREAM: mm: page_alloc: move free pages when converting block during isolation *f76299151c
UPSTREAM: mm: page_alloc: fix up block types when merging compatible blocks *cb610236ed
UPSTREAM: mm: page_alloc: optimize free_unref_folios() *606130dacb
BACKPORT: mm: page_alloc: remove pcppage migratetype caching *a7a880e6de
UPSTREAM: mm: allow non-hugetlb large folios to be batch processed *f17c4db9cf
BACKPORT: mm: handle large folios in free_unref_folios() *c7f67cfb85
UPSTREAM: mm: use folios_put() in __folio_batch_release() *445fa9a71a
BACKPORT: mm: add free_unref_folios() *cc058410b3
BACKPORT: mm: convert free_unref_page_list() to use folios *980cb4e2ba
BACKPORT: mm: make folios_put() the basis of release_pages() *5f4ed005d7
Revert "BACKPORT: mm: page_alloc: remove pcppage migratetype caching" *bab99c1b7e
Revert "UPSTREAM: mm: page_alloc: fix up block types when merging compatible blocks" *94e3afbb3d
Revert "UPSTREAM: mm: page_alloc: move free pages when converting block during isolation" *13aa15180a
Revert "UPSTREAM: mm: page_alloc: fix move_freepages_block() range error" *d47518de38
Revert "UPSTREAM: mm: page_alloc: fix freelist movement during block conversion" *135ab7374e
Revert "BACKPORT: mm: page_alloc: close migratetype race between freeing and stealing" *9ed2d2fba2
Revert "UPSTREAM: mm: page_alloc: set migratetype inside move_freepages()" *efbdb11ac1
Revert "BACKPORT: mm: page_isolation: prepare for hygienic freelists" *7d424e0f80
Revert "BACKPORT: mm: page_alloc: consolidate free page accounting" *8a91cd1d26
Revert "BACKPORT: mm: page_alloc: batch vmstat updates in expand()" *be6d3cc085
Revert "UPSTREAM: mm: page_alloc: fix highatomic typing in multi-block buddies" *bbc65a78d2
Revert "BACKPORT: mm/page_alloc: keep track of free highatomic" *a7a0d95bca
Revert "BACKPORT: mm: page_alloc: optimize free_unref_folios()" *8b5d78fb5c
Revert "ANDROID: fuse-bpf: fix wrong logic in read backing" *c1488e58c3
ANDROID: GKI: Update symbol list for Nvidia *1e3d640b05
ANDROID: GKI: Add initial Nvidia symbol list *5fa476bd0b
ANDROID: Add ufs headers to aarch64 allowlist *17daf81bcc
ANDROID: KVM: arm64: Allow relinqush for p-guest with huge-mappings *297e1ff805
ANDROID: KVM: arm64: Use unmap for pKVM guests memory relinquish *7c95a219c0
ANDROID: KVM: arm64: Add hyp request SPLIT *e56d181356
ANDROID: KVM: arm64: Convert kvm_pinned_pages to an interval-tree *390699f93d
ANDROID: KVM: arm64: Add host_split_guest for pKVM *16df80ab9c
ANDROID: KVM: arm64: Disable relinquish for p-guest huge-mappings *549ac47ca0
FROMGIT: PM: runtime: fix denying of auto suspend in pm_suspend_timer_fn() *4cdfd02ff2
ANDROID: Enable SHA1 for microdroid *ab0ad8d198
BACKPORT: mm: page_alloc: optimize free_unref_folios() Change-Id: Ic5571553dd22417e2ff66c8e99c114b8d79476f2 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
This commit is contained in:
@@ -126,6 +126,7 @@ filegroup(
|
||||
"android/abi_gki_aarch64_mtk",
|
||||
"android/abi_gki_aarch64_mtktv",
|
||||
"android/abi_gki_aarch64_nothing",
|
||||
"android/abi_gki_aarch64_nvidia",
|
||||
"android/abi_gki_aarch64_oplus",
|
||||
"android/abi_gki_aarch64_paragon",
|
||||
"android/abi_gki_aarch64_pixel",
|
||||
@@ -140,7 +141,7 @@ filegroup(
|
||||
"android/abi_gki_aarch64_virtual_device",
|
||||
"android/abi_gki_aarch64_vivo",
|
||||
"android/abi_gki_aarch64_xiaomi",
|
||||
"android/abi_gki_aarch64_xiaomi2",
|
||||
"android/abi_gki_aarch64_xiaomi_xring",
|
||||
],
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
@@ -1028,6 +1029,9 @@ ddk_headers(
|
||||
"drivers/pci/controller/dwc/pcie-designware.h",
|
||||
"drivers/thermal/thermal_core.h",
|
||||
"drivers/thermal/thermal_netlink.h",
|
||||
"drivers/ufs/core/ufshcd-crypto.h",
|
||||
"drivers/ufs/core/ufshcd-priv.h",
|
||||
"drivers/ufs/host/ufshcd-pltfrm.h",
|
||||
"drivers/usb/dwc3/core.h",
|
||||
"sound/usb/card.h",
|
||||
"sound/usb/usbaudio.h",
|
||||
@@ -1045,6 +1049,7 @@ ddk_headers(
|
||||
"drivers/extcon",
|
||||
"drivers/pci/controller/dwc",
|
||||
"drivers/thermal",
|
||||
"drivers/ufs",
|
||||
"drivers/usb",
|
||||
"sound/usb",
|
||||
"include",
|
||||
|
@@ -202,6 +202,16 @@ PMD-mappable transparent hugepage::
|
||||
|
||||
cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
|
||||
|
||||
All THPs at fault and collapse time will be added to _deferred_list,
|
||||
and will therefore be split under memory presure if they are considered
|
||||
"underused". A THP is underused if the number of zero-filled pages in
|
||||
the THP is above max_ptes_none (see below). It is possible to disable
|
||||
this behaviour by writing 0 to shrink_underused, and enable it by writing
|
||||
1 to it::
|
||||
|
||||
echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused
|
||||
echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused
|
||||
|
||||
khugepaged will be automatically started when one or more hugepage
|
||||
sizes are enabled (either by directly setting "always" or "madvise",
|
||||
or by setting "inherit" while the top-level enabled is set to "always"
|
||||
@@ -443,6 +453,12 @@ thp_deferred_split_page
|
||||
splitting it would free up some memory. Pages on split queue are
|
||||
going to be split under memory pressure.
|
||||
|
||||
thp_underused_split_page
|
||||
is incremented when a huge page on the split queue was split
|
||||
because it was underused. A THP is underused if the number of
|
||||
zero pages in the THP is above a certain threshold
|
||||
(/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none).
|
||||
|
||||
thp_split_pmd
|
||||
is incremented every time a PMD split into table of PTEs.
|
||||
This can happen, for instance, when application calls mprotect() or
|
||||
@@ -510,6 +526,18 @@ split_deferred
|
||||
it would free up some memory. Pages on split queue are going to
|
||||
be split under memory pressure, if splitting is possible.
|
||||
|
||||
nr_anon
|
||||
the number of anonymous THP we have in the whole system. These THPs
|
||||
might be currently entirely mapped or have partially unmapped/unused
|
||||
subpages.
|
||||
|
||||
nr_anon_partially_mapped
|
||||
the number of anonymous THP which are likely partially mapped, possibly
|
||||
wasting memory, and have been queued for deferred memory reclamation.
|
||||
Note that in corner some cases (e.g., failed migration), we might detect
|
||||
an anonymous THP as "partially mapped" and count it here, even though it
|
||||
is not actually partially mapped anymore.
|
||||
|
||||
As the system ages, allocating huge pages may be expensive as the
|
||||
system uses memory compaction to copy data around memory to free a
|
||||
huge page for use. There are some counters in ``/proc/vmstat`` to help
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -132,3 +132,84 @@ type 'struct io_ring_ctx' changed
|
||||
1 variable symbol(s) removed
|
||||
'struct tracepoint __tracepoint_android_vh_filemap_fault_before_folio_locked'
|
||||
|
||||
type 'struct kvm_protected_vm' changed
|
||||
member 'struct maple_tree pinned_pages' was removed
|
||||
member 'union { struct rb_root_cached pinned_pages; struct { struct maple_tree __unused; }; union { }; }' was added
|
||||
|
||||
type 'struct kvm_hyp_req' changed
|
||||
member changed from 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; }' to 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; struct { unsigned long guest_ipa; size_t size; } split; }'
|
||||
type changed from 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; }' to 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; struct { unsigned long guest_ipa; size_t size; } split; }'
|
||||
member 'struct { unsigned long guest_ipa; size_t size; } split' was added
|
||||
|
||||
type 'struct scm_fp_list' changed
|
||||
byte size changed from 2040 to 2048
|
||||
member 'short count_unix' was added
|
||||
|
||||
type 'struct scm_fp_list' changed
|
||||
byte size changed from 2048 to 2064
|
||||
member 'struct list_head vertices' was added
|
||||
member 'short count_unix' changed
|
||||
offset changed by 128
|
||||
|
||||
type 'struct scm_fp_list' changed
|
||||
byte size changed from 2064 to 2072
|
||||
member 'struct unix_edge* edges' was added
|
||||
member 'short count_unix' changed
|
||||
offset changed by 64
|
||||
|
||||
type 'struct scm_fp_list' changed
|
||||
byte size changed from 2072 to 2080
|
||||
member 'bool inflight' was added
|
||||
3 members ('struct list_head vertices' .. 'short count_unix') changed
|
||||
offset changed by 64
|
||||
|
||||
type 'struct unix_edge' changed
|
||||
byte size changed from 32 to 48
|
||||
member 'struct list_head stack_entry' was added
|
||||
|
||||
type 'struct unix_vertex' changed
|
||||
byte size changed from 40 to 48
|
||||
member 'unsigned long index' was added
|
||||
|
||||
type 'struct unix_vertex' changed
|
||||
byte size changed from 48 to 80
|
||||
member 'struct list_head scc_entry' was added
|
||||
2 members ('unsigned long out_degree' .. 'unsigned long index') changed
|
||||
offset changed by 128
|
||||
member 'unsigned long lowlink' was added
|
||||
member 'bool on_stack' was added
|
||||
|
||||
type 'struct unix_sock' changed
|
||||
member 'struct sock* listener' was added
|
||||
4 members ('struct list_head link' .. 'unsigned long gc_flags') changed
|
||||
offset changed by 64
|
||||
|
||||
type 'struct unix_vertex' changed
|
||||
byte size changed from 80 to 72
|
||||
member 'bool on_stack' was removed
|
||||
|
||||
type 'struct unix_vertex' changed
|
||||
member 'unsigned long lowlink' was removed
|
||||
member 'unsigned long scc_index' was added
|
||||
|
||||
type 'struct unix_sock' changed
|
||||
byte size changed from 1216 to 1152
|
||||
member 'struct list_head link' was removed
|
||||
member 'unsigned long inflight' was removed
|
||||
member 'spinlock_t lock' changed
|
||||
offset changed by -192
|
||||
member 'unsigned long gc_flags' was removed
|
||||
4 members ('struct socket_wq peer_wq' .. 'struct sk_buff* oob_skb') changed
|
||||
offset changed by -512
|
||||
|
||||
type 'struct unix_sock' changed
|
||||
member 'struct sk_buff* oob_skb' changed
|
||||
offset changed by 64
|
||||
|
||||
type 'struct scm_stat' changed
|
||||
byte size changed from 4 to 16
|
||||
member 'unsigned long nr_unix_fds' was added
|
||||
|
||||
type 'struct scm_fp_list' changed
|
||||
member 'bool dead' was added
|
||||
|
||||
|
232
android/abi_gki_aarch64_nvidia
Normal file
232
android/abi_gki_aarch64_nvidia
Normal file
@@ -0,0 +1,232 @@
|
||||
[abi_symbol_list]
|
||||
# commonly used symbols
|
||||
alloc_chrdev_region
|
||||
alt_cb_patch_nops
|
||||
__arch_copy_from_user
|
||||
__arch_copy_to_user
|
||||
cdev_add
|
||||
cdev_del
|
||||
cdev_init
|
||||
__check_object_size
|
||||
class_create
|
||||
class_destroy
|
||||
complete
|
||||
dev_driver_string
|
||||
_dev_err
|
||||
device_create
|
||||
device_destroy
|
||||
_dev_info
|
||||
devm_kfree
|
||||
devm_kmalloc
|
||||
devm_memremap
|
||||
devm_request_threaded_irq
|
||||
_dev_warn
|
||||
fortify_panic
|
||||
free_irq
|
||||
__init_swait_queue_head
|
||||
init_timer_key
|
||||
__init_waitqueue_head
|
||||
jiffies_to_usecs
|
||||
kfree
|
||||
__kmalloc
|
||||
kmalloc_caches
|
||||
kmalloc_trace
|
||||
kstrtouint
|
||||
log_post_read_mmio
|
||||
log_read_mmio
|
||||
memcpy
|
||||
__memcpy_fromio
|
||||
memset
|
||||
module_layout
|
||||
__mutex_init
|
||||
mutex_lock
|
||||
mutex_unlock
|
||||
of_find_property
|
||||
of_property_read_u32_index
|
||||
of_property_read_variable_u32_array
|
||||
panic
|
||||
pid_task
|
||||
__platform_driver_register
|
||||
platform_driver_unregister
|
||||
_printk
|
||||
__put_task_struct
|
||||
_raw_spin_lock
|
||||
_raw_spin_unlock
|
||||
request_threaded_irq
|
||||
schedule_timeout
|
||||
snprintf
|
||||
__stack_chk_fail
|
||||
strlen
|
||||
strncmp
|
||||
strnlen
|
||||
strscpy
|
||||
sysfs_create_group
|
||||
sysfs_remove_group
|
||||
system_cpucaps
|
||||
system_wq
|
||||
tegra_ivc_notified
|
||||
tegra_ivc_read_advance
|
||||
tegra_ivc_read_get_next_frame
|
||||
tegra_ivc_reset
|
||||
tegra_ivc_write_advance
|
||||
tegra_ivc_write_get_next_frame
|
||||
__traceiter_rwmmio_post_read
|
||||
__traceiter_rwmmio_read
|
||||
__tracepoint_rwmmio_post_read
|
||||
__tracepoint_rwmmio_read
|
||||
unregister_chrdev_region
|
||||
__wake_up
|
||||
__warn_printk
|
||||
|
||||
# required by ivc-cdev.ko
|
||||
device_del
|
||||
devm_free_irq
|
||||
noop_llseek
|
||||
remap_pfn_range
|
||||
|
||||
# required by ivc_ext.ko
|
||||
dma_sync_single_for_cpu
|
||||
__memcpy_toio
|
||||
|
||||
# required by nvsciipc.ko
|
||||
_dev_notice
|
||||
__fdget
|
||||
find_get_pid
|
||||
fput
|
||||
platform_device_register_full
|
||||
platform_device_unregister
|
||||
sprintf
|
||||
|
||||
# required by tegra_bpmp.ko
|
||||
clk_hw_determine_rate_no_reparent
|
||||
clk_hw_get_name
|
||||
clk_hw_unregister
|
||||
debugfs_create_dir
|
||||
debugfs_create_file
|
||||
debugfs_remove
|
||||
dentry_path_raw
|
||||
devm_clk_hw_register
|
||||
devm_reset_controller_register
|
||||
dma_alloc_attrs
|
||||
dma_free_attrs
|
||||
_find_next_bit
|
||||
kmalloc_large
|
||||
kstrdup
|
||||
ktime_get
|
||||
of_clk_add_hw_provider
|
||||
of_device_get_match_data
|
||||
of_genpd_add_provider_onecell
|
||||
__of_parse_phandle_with_args
|
||||
of_platform_default_populate
|
||||
pm_genpd_init
|
||||
pm_genpd_remove
|
||||
seq_lseek
|
||||
seq_read
|
||||
seq_write
|
||||
single_open_size
|
||||
single_release
|
||||
strncpy
|
||||
tegra_bpmp_free_mrq
|
||||
tegra_bpmp_mrq_is_supported
|
||||
tegra_bpmp_mrq_return
|
||||
tegra_bpmp_request_mrq
|
||||
tegra_bpmp_transfer
|
||||
tegra_bpmp_transfer_atomic
|
||||
tegra_sku_info
|
||||
|
||||
# required by tegra_hv.ko
|
||||
arm64_use_ng_mappings
|
||||
class_create_file_ns
|
||||
ioremap_prot
|
||||
iounmap
|
||||
irq_get_irq_data
|
||||
memstart_addr
|
||||
of_add_property
|
||||
of_chosen
|
||||
of_find_compatible_node
|
||||
of_irq_get
|
||||
pfn_is_map_memory
|
||||
tegra_ivc_init
|
||||
|
||||
# required by tegra_hv_pm_ctl.ko
|
||||
__alloc_skb
|
||||
find_vpid
|
||||
finish_wait
|
||||
init_net
|
||||
init_wait_entry
|
||||
msleep
|
||||
__netlink_kernel_create
|
||||
netlink_unicast
|
||||
__nlmsg_put
|
||||
prepare_to_wait_event
|
||||
register_pm_notifier
|
||||
schedule
|
||||
strcmp
|
||||
wait_for_completion_timeout
|
||||
|
||||
# required by tegra_hv_vblk_oops.ko
|
||||
delayed_work_timer_fn
|
||||
dma_map_page_attrs
|
||||
__get_free_pages
|
||||
is_vmalloc_addr
|
||||
queue_delayed_work_on
|
||||
|
||||
# required by tegra_vblk.ko
|
||||
blk_execute_rq
|
||||
blk_mq_alloc_disk_for_queue
|
||||
blk_mq_alloc_request
|
||||
blk_mq_alloc_tag_set
|
||||
blk_mq_destroy_queue
|
||||
blk_mq_end_request
|
||||
blk_mq_free_request
|
||||
blk_mq_free_tag_set
|
||||
blk_mq_init_queue
|
||||
blk_mq_start_hw_queues
|
||||
blk_mq_start_request
|
||||
blk_mq_stop_hw_queues
|
||||
blk_queue_flag_set
|
||||
blk_queue_logical_block_size
|
||||
blk_queue_max_discard_sectors
|
||||
blk_queue_max_hw_sectors
|
||||
blk_queue_max_secure_erase_sectors
|
||||
blk_queue_physical_block_size
|
||||
blk_queue_write_cache
|
||||
__blk_rq_map_sg
|
||||
capable
|
||||
__cpu_possible_mask
|
||||
del_gendisk
|
||||
device_add_disk
|
||||
device_create_file
|
||||
disable_irq
|
||||
disk_check_media_change
|
||||
dma_map_sg_attrs
|
||||
dma_unmap_sg_attrs
|
||||
enable_irq
|
||||
_find_first_zero_bit
|
||||
jiffies
|
||||
kasan_flag_enabled
|
||||
kthread_create_on_cpu
|
||||
kthread_create_on_node
|
||||
__list_add_valid_or_report
|
||||
__list_del_entry_valid_or_report
|
||||
mod_timer
|
||||
__num_online_cpus
|
||||
of_find_node_by_name
|
||||
put_disk
|
||||
queue_work_on
|
||||
_raw_spin_lock_irqsave
|
||||
_raw_spin_unlock_irqrestore
|
||||
__register_blkdev
|
||||
sched_setattr_nocheck
|
||||
set_capacity
|
||||
set_disk_ro
|
||||
sg_init_table
|
||||
sg_nents
|
||||
__sw_hweight64
|
||||
timer_delete
|
||||
unregister_blkdev
|
||||
vfree
|
||||
vzalloc
|
||||
wait_for_completion
|
||||
wait_for_completion_interruptible
|
||||
wake_up_process
|
@@ -883,6 +883,7 @@
|
||||
drm_mode_duplicate
|
||||
drm_mode_equal
|
||||
drm_mode_equal_no_clocks
|
||||
drm_mode_is_420_only
|
||||
drm_mode_object_find
|
||||
drm_mode_object_get
|
||||
drm_mode_object_put
|
||||
@@ -2620,6 +2621,7 @@
|
||||
touch_softlockup_watchdog
|
||||
trace_array_destroy
|
||||
trace_array_get_by_name
|
||||
trace_array_get_by_name_ext
|
||||
trace_array_put
|
||||
trace_array_set_clr_event
|
||||
trace_event_buffer_commit
|
||||
@@ -2731,6 +2733,7 @@
|
||||
__traceiter_android_vh_ufs_update_sysfs
|
||||
__traceiter_android_vh_usb_dev_resume
|
||||
__traceiter_android_vh_use_amu_fie
|
||||
__traceiter_android_vh_xhci_full_reset_on_remove
|
||||
__traceiter_clock_set_rate
|
||||
__traceiter_cma_alloc_finish
|
||||
__traceiter_cma_alloc_start
|
||||
@@ -2869,6 +2872,7 @@
|
||||
__tracepoint_android_vh_ufs_update_sysfs
|
||||
__tracepoint_android_vh_usb_dev_resume
|
||||
__tracepoint_android_vh_use_amu_fie
|
||||
__tracepoint_android_vh_xhci_full_reset_on_remove
|
||||
__tracepoint_clock_set_rate
|
||||
__tracepoint_cma_alloc_finish
|
||||
__tracepoint_cma_alloc_start
|
||||
|
@@ -154,6 +154,8 @@
|
||||
__traceiter_android_vh_look_around_migrate_folio
|
||||
__traceiter_android_vh_lruvec_add_folio
|
||||
__traceiter_android_vh_lruvec_del_folio
|
||||
__traceiter_android_vh_mempool_alloc_skip_wait
|
||||
__traceiter_android_vh_mm_free_page
|
||||
__traceiter_android_vh_mmap_region
|
||||
__traceiter_android_vh_mutex_init
|
||||
__traceiter_android_vh_mutex_unlock_slowpath
|
||||
@@ -284,6 +286,8 @@
|
||||
__tracepoint_android_vh_look_around_migrate_folio
|
||||
__tracepoint_android_vh_lruvec_add_folio
|
||||
__tracepoint_android_vh_lruvec_del_folio
|
||||
__tracepoint_android_vh_mempool_alloc_skip_wait
|
||||
__tracepoint_android_vh_mm_free_page
|
||||
__tracepoint_android_vh_mmap_region
|
||||
__tracepoint_android_vh_mutex_init
|
||||
__tracepoint_android_vh_mutex_unlock_slowpath
|
||||
|
@@ -23,6 +23,8 @@
|
||||
__tracepoint_android_vh_tune_swappiness
|
||||
__traceiter_android_vh_do_shrink_slab_ex
|
||||
__tracepoint_android_vh_do_shrink_slab_ex
|
||||
__traceiter_android_vh_migration_target_bypass
|
||||
__tracepoint_android_vh_migration_target_bypass
|
||||
|
||||
# required by lz4 decompress module
|
||||
__tracepoint_android_vh_lz4_decompress_bypass
|
||||
|
@@ -1911,6 +1911,7 @@
|
||||
scsi_report_bus_reset
|
||||
scsi_scan_host
|
||||
scsi_unblock_requests
|
||||
scsi_host_busy
|
||||
sdev_prefix_printk
|
||||
security_file_ioctl
|
||||
select_fallback_rq
|
@@ -737,6 +737,7 @@ CONFIG_CRYPTO_LZ4=y
|
||||
CONFIG_CRYPTO_ZSTD=y
|
||||
CONFIG_CRYPTO_ANSI_CPRNG=y
|
||||
CONFIG_CRYPTO_GHASH_ARM64_CE=y
|
||||
CONFIG_CRYPTO_SHA1_ARM64_CE=y
|
||||
CONFIG_CRYPTO_SHA2_ARM64_CE=y
|
||||
CONFIG_CRYPTO_SHA512_ARM64_CE=y
|
||||
CONFIG_CRYPTO_POLYVAL_ARM64_CE=y
|
||||
|
@@ -8,6 +8,8 @@ CONFIG_RCU_EXPERT=y
|
||||
CONFIG_IKCONFIG=y
|
||||
CONFIG_IKCONFIG_PROC=y
|
||||
CONFIG_LOG_BUF_SHIFT=14
|
||||
CONFIG_CGROUPS=y
|
||||
CONFIG_MEMCG=y
|
||||
# CONFIG_RD_GZIP is not set
|
||||
# CONFIG_RD_BZIP2 is not set
|
||||
# CONFIG_RD_LZMA is not set
|
||||
@@ -136,8 +138,10 @@ CONFIG_STATIC_USERMODEHELPER_PATH=""
|
||||
CONFIG_SECURITY_SELINUX=y
|
||||
CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
|
||||
CONFIG_BUG_ON_DATA_CORRUPTION=y
|
||||
CONFIG_CRYPTO_SHA1=y
|
||||
CONFIG_CRYPTO_HCTR2=y
|
||||
CONFIG_CRYPTO_LZO=y
|
||||
CONFIG_CRYPTO_SHA1_ARM64_CE=y
|
||||
CONFIG_CRYPTO_SHA2_ARM64_CE=y
|
||||
CONFIG_CRYPTO_POLYVAL_ARM64_CE=y
|
||||
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
|
||||
|
@@ -83,6 +83,7 @@ enum __kvm_host_smccc_func {
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_relax_perms,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_wrprotect,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_dirty_log,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_host_split_guest,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_tlb_flush_vmid,
|
||||
__KVM_HOST_SMCCC_FUNC___kvm_adjust_pc,
|
||||
__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
|
||||
|
@@ -224,20 +224,36 @@ struct kvm_smccc_features {
|
||||
};
|
||||
|
||||
struct kvm_pinned_page {
|
||||
union {
|
||||
struct rb_node node;
|
||||
struct list_head list_node;
|
||||
};
|
||||
struct page *page;
|
||||
u64 ipa;
|
||||
u64 __subtree_last;
|
||||
u8 order;
|
||||
u16 pins;
|
||||
};
|
||||
|
||||
#define KVM_DUMMY_PPAGE ((struct kvm_pinned_page *)-1)
|
||||
struct kvm_pinned_page
|
||||
*kvm_pinned_pages_iter_first(struct rb_root_cached *root, u64 start, u64 end);
|
||||
struct kvm_pinned_page
|
||||
*kvm_pinned_pages_iter_next(struct kvm_pinned_page *ppage, u64 start, u64 end);
|
||||
|
||||
#define for_ppage_node_in_range(kvm, start, end, __ppage, __tmp) \
|
||||
for (__ppage = kvm_pinned_pages_iter_first(&(kvm)->arch.pkvm.pinned_pages, start, end - 1);\
|
||||
__ppage && ({ __tmp = kvm_pinned_pages_iter_next(__ppage, start, end - 1); 1; }); \
|
||||
__ppage = __tmp)
|
||||
|
||||
void kvm_pinned_pages_remove(struct kvm_pinned_page *ppage,
|
||||
struct rb_root_cached *root);
|
||||
|
||||
typedef unsigned int pkvm_handle_t;
|
||||
|
||||
struct kvm_protected_vm {
|
||||
pkvm_handle_t handle;
|
||||
struct kvm_hyp_memcache stage2_teardown_mc;
|
||||
struct maple_tree pinned_pages;
|
||||
_ANDROID_KABI_REPLACE(struct maple_tree __unused, struct rb_root_cached pinned_pages);
|
||||
gpa_t pvmfw_load_addr;
|
||||
bool enabled;
|
||||
};
|
||||
@@ -525,6 +541,7 @@ struct kvm_hyp_req {
|
||||
#define KVM_HYP_LAST_REQ 0
|
||||
#define KVM_HYP_REQ_TYPE_MEM 1
|
||||
#define KVM_HYP_REQ_TYPE_MAP 2
|
||||
#define KVM_HYP_REQ_TYPE_SPLIT 3
|
||||
u8 type;
|
||||
union {
|
||||
struct {
|
||||
@@ -539,6 +556,12 @@ struct kvm_hyp_req {
|
||||
unsigned long guest_ipa;
|
||||
size_t size;
|
||||
} map;
|
||||
#ifndef __GENKSYMS__
|
||||
struct {
|
||||
unsigned long guest_ipa;
|
||||
size_t size;
|
||||
} split;
|
||||
#endif
|
||||
};
|
||||
};
|
||||
|
||||
|
@@ -184,6 +184,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
|
||||
|
||||
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
|
||||
int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t size);
|
||||
int __pkvm_pgtable_stage2_split(struct kvm_vcpu *vcpu, phys_addr_t ipa, size_t size);
|
||||
|
||||
phys_addr_t kvm_mmu_get_httbr(void);
|
||||
phys_addr_t kvm_get_idmap_vector(void);
|
||||
|
@@ -862,8 +862,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
|
||||
* kvm_pgtable_stage2_split() is best effort: it tries to break as many
|
||||
* blocks in the input range as allowed by @mc_capacity.
|
||||
*/
|
||||
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
|
||||
struct kvm_mmu_memory_cache *mc);
|
||||
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc);
|
||||
|
||||
/**
|
||||
* kvm_pgtable_walk() - Walk a page-table.
|
||||
|
@@ -363,6 +363,11 @@ static int handle_hyp_req_map(struct kvm_vcpu *vcpu,
|
||||
return pkvm_mem_abort_range(vcpu, req->map.guest_ipa, req->map.size);
|
||||
}
|
||||
|
||||
static int handle_hyp_req_split(struct kvm_vcpu *vcpu, struct kvm_hyp_req *req)
|
||||
{
|
||||
return __pkvm_pgtable_stage2_split(vcpu, req->split.guest_ipa, req->split.size);
|
||||
}
|
||||
|
||||
static int handle_hyp_req(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_hyp_req *hyp_req = vcpu->arch.hyp_reqs;
|
||||
@@ -379,6 +384,9 @@ static int handle_hyp_req(struct kvm_vcpu *vcpu)
|
||||
case KVM_HYP_REQ_TYPE_MAP:
|
||||
ret = handle_hyp_req_map(vcpu, hyp_req);
|
||||
break;
|
||||
case KVM_HYP_REQ_TYPE_SPLIT:
|
||||
ret = handle_hyp_req_split(vcpu, hyp_req);
|
||||
break;
|
||||
default:
|
||||
pr_warn("Unknown kvm_hyp_req type: %d\n", hyp_req->type);
|
||||
ret = -EINVAL;
|
||||
|
@@ -63,6 +63,7 @@ int __pkvm_host_unuse_dma(u64 phys_addr, size_t size);
|
||||
int __pkvm_guest_stage2_snapshot(struct kvm_pgtable_snapshot *snap, struct pkvm_hyp_vm *vm);
|
||||
int __pkvm_host_stage2_snapshot(struct kvm_pgtable_snapshot *snap);
|
||||
int __pkvm_host_lazy_pte(u64 pfn, u64 nr_pages, bool enable);
|
||||
int __pkvm_host_split_guest(u64 pfn, u64 gfn, u64 size, struct pkvm_hyp_vcpu *vcpu);
|
||||
|
||||
bool addr_is_memory(phys_addr_t phys);
|
||||
int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
|
||||
|
@@ -556,7 +556,7 @@ void *hyp_alloc(size_t size)
|
||||
unsigned long chunk_addr;
|
||||
int missing_map, ret = 0;
|
||||
|
||||
size = ALIGN(size, MIN_ALLOC);
|
||||
size = ALIGN(size ?: MIN_ALLOC, MIN_ALLOC);
|
||||
|
||||
hyp_spin_lock(&allocator->lock);
|
||||
|
||||
|
@@ -1073,6 +1073,27 @@ out:
|
||||
cpu_reg(host_ctxt, 1) = ret;
|
||||
}
|
||||
|
||||
static void handle___pkvm_host_split_guest(struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
DECLARE_REG(u64, pfn, host_ctxt, 1);
|
||||
DECLARE_REG(u64, gfn, host_ctxt, 2);
|
||||
DECLARE_REG(u64, size, host_ctxt, 3);
|
||||
struct pkvm_hyp_vcpu *hyp_vcpu;
|
||||
int ret = -EINVAL;
|
||||
|
||||
if (!is_protected_kvm_enabled())
|
||||
goto out;
|
||||
|
||||
hyp_vcpu = pkvm_get_loaded_hyp_vcpu();
|
||||
if (!hyp_vcpu)
|
||||
goto out;
|
||||
|
||||
ret = __pkvm_host_split_guest(pfn, gfn, size, hyp_vcpu);
|
||||
|
||||
out:
|
||||
cpu_reg(host_ctxt, 1) = ret;
|
||||
}
|
||||
|
||||
static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
struct pkvm_hyp_vcpu *hyp_vcpu;
|
||||
@@ -1618,6 +1639,7 @@ static const hcall_t host_hcall[] = {
|
||||
HANDLE_FUNC(__pkvm_relax_perms),
|
||||
HANDLE_FUNC(__pkvm_wrprotect),
|
||||
HANDLE_FUNC(__pkvm_dirty_log),
|
||||
HANDLE_FUNC(__pkvm_host_split_guest),
|
||||
HANDLE_FUNC(__pkvm_tlb_flush_vmid),
|
||||
HANDLE_FUNC(__kvm_adjust_pc),
|
||||
HANDLE_FUNC(__kvm_vcpu_run),
|
||||
|
@@ -387,6 +387,10 @@ static int relinquish_walker(const struct kvm_pgtable_visit_ctx *ctx,
|
||||
if (!kvm_pte_valid(pte))
|
||||
return 0;
|
||||
|
||||
/* We don't support splitting non-leaf mappings */
|
||||
if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1))
|
||||
return -E2BIG;
|
||||
|
||||
state = pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
|
||||
if (state != data->expected_state)
|
||||
return -EPERM;
|
||||
@@ -433,8 +437,7 @@ int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu,
|
||||
goto end;
|
||||
|
||||
/* Zap the guest stage2 pte and return ownership to the host */
|
||||
ret = kvm_pgtable_stage2_annotate(&vm->pgt, ipa, PAGE_SIZE,
|
||||
&vcpu->vcpu.arch.stage2_mc, 0);
|
||||
ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE);
|
||||
if (ret)
|
||||
goto end;
|
||||
|
||||
@@ -2760,6 +2763,30 @@ unlock:
|
||||
|
||||
}
|
||||
|
||||
int __pkvm_host_split_guest(u64 pfn, u64 gfn, u64 size, struct pkvm_hyp_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_hyp_memcache *mc = &vcpu->vcpu.arch.stage2_mc;
|
||||
struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu);
|
||||
u64 ipa = hyp_pfn_to_phys(gfn);
|
||||
int ret;
|
||||
|
||||
if (size != PMD_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
guest_lock_component(vm);
|
||||
|
||||
/*
|
||||
* stage2_split() already checks the existing mapping is valid and PMD-level.
|
||||
* No other check is necessary.
|
||||
*/
|
||||
|
||||
ret = kvm_pgtable_stage2_split(&vm->pgt, ipa, size, mc);
|
||||
|
||||
guest_unlock_component(vm);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __pkvm_host_donate_guest(struct pkvm_hyp_vcpu *vcpu, u64 pfn, u64 gfn,
|
||||
u64 nr_pages)
|
||||
{
|
||||
|
@@ -702,16 +702,13 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu,
|
||||
if (ret)
|
||||
goto done;
|
||||
|
||||
ret = pkvm_vcpu_init_psci(hyp_vcpu);
|
||||
if (ret)
|
||||
goto done;
|
||||
|
||||
if (test_bit(KVM_ARM_VCPU_SVE, hyp_vcpu->vcpu.arch.features)) {
|
||||
ret = init_pkvm_hyp_vcpu_sve(hyp_vcpu, host_vcpu);
|
||||
if (ret)
|
||||
goto done;
|
||||
}
|
||||
|
||||
WARN_ON(pkvm_vcpu_init_psci(hyp_vcpu));
|
||||
pkvm_vcpu_init_traps(hyp_vcpu);
|
||||
kvm_reset_pvm_sys_regs(&hyp_vcpu->vcpu);
|
||||
done:
|
||||
@@ -1588,9 +1585,19 @@ static bool pkvm_memrelinquish_call(struct pkvm_hyp_vcpu *hyp_vcpu,
|
||||
goto out_guest_err;
|
||||
|
||||
ret = __pkvm_guest_relinquish_to_host(hyp_vcpu, ipa, &pa);
|
||||
if (ret == -ENOMEM) {
|
||||
if (pkvm_handle_empty_memcache(hyp_vcpu, exit_code))
|
||||
if (ret == -E2BIG) {
|
||||
struct kvm_hyp_req *req = pkvm_hyp_req_reserve(hyp_vcpu, KVM_HYP_REQ_TYPE_SPLIT);
|
||||
|
||||
if (!req) {
|
||||
ret = -ENOMEM;
|
||||
goto out_guest_err;
|
||||
}
|
||||
|
||||
req->split.guest_ipa = ALIGN_DOWN(ipa, PMD_SIZE);
|
||||
req->split.size = PMD_SIZE;
|
||||
|
||||
write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR);
|
||||
*exit_code = ARM_EXCEPTION_HYP_REQ;
|
||||
|
||||
return false;
|
||||
} else if (ret) {
|
||||
|
@@ -1769,13 +1769,49 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
|
||||
struct kvm_mmu_memory_cache *mc)
|
||||
static int pkvm_stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
|
||||
enum kvm_pgtable_walk_flags visit)
|
||||
{
|
||||
struct stage2_map_data *data = ctx->arg;
|
||||
struct kvm_pgtable *pgt = data->mmu->pgt;
|
||||
struct kvm_hyp_memcache *mc = data->memcache;
|
||||
enum kvm_pgtable_prot prot;
|
||||
kvm_pte_t pte = ctx->old;
|
||||
kvm_pte_t *childp;
|
||||
|
||||
if (ctx->level == KVM_PGTABLE_MAX_LEVELS - 1)
|
||||
return 0;
|
||||
|
||||
/* We can only split PMD-level blocks */
|
||||
if (!kvm_pte_valid(pte) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 2)
|
||||
return -EINVAL;
|
||||
|
||||
prot = kvm_pgtable_stage2_pte_prot(pte);
|
||||
childp = kvm_pgtable_stage2_create_unlinked(pgt, kvm_pte_to_phys(pte),
|
||||
ctx->level, prot, mc, true);
|
||||
if (IS_ERR(childp))
|
||||
return PTR_ERR(childp);
|
||||
|
||||
WARN_ON(!stage2_try_break_pte(ctx, data->mmu));
|
||||
|
||||
stage2_make_pte(ctx, kvm_init_table_pte(childp, ctx->mm_ops));
|
||||
dsb(ishst);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc)
|
||||
{
|
||||
struct stage2_map_data data = {
|
||||
.mmu = pgt->mmu,
|
||||
.memcache = mc,
|
||||
};
|
||||
struct kvm_pgtable_walker walker = {
|
||||
.cb = stage2_split_walker,
|
||||
.cb = static_branch_unlikely(&kvm_protected_mode_initialized) ?
|
||||
pkvm_stage2_split_walker : stage2_split_walker,
|
||||
.arg = static_branch_unlikely(&kvm_protected_mode_initialized) ?
|
||||
&data : mc,
|
||||
.flags = KVM_PGTABLE_WALK_LEAF,
|
||||
.arg = mc,
|
||||
};
|
||||
|
||||
return kvm_pgtable_walk(pgt, addr, size, &walker);
|
||||
|
@@ -6,11 +6,11 @@
|
||||
|
||||
#include <linux/cma.h>
|
||||
#include <linux/dma-map-ops.h>
|
||||
#include <linux/maple_tree.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/interval_tree_generic.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <trace/events/kvm.h>
|
||||
#include <asm/pgalloc.h>
|
||||
@@ -291,6 +291,20 @@ static void invalidate_icache_guest_page(void *va, size_t size)
|
||||
__invalidate_icache_guest_page(va, size);
|
||||
}
|
||||
|
||||
static u64 __pinned_page_start(struct kvm_pinned_page *ppage)
|
||||
{
|
||||
return ppage->ipa;
|
||||
}
|
||||
|
||||
static u64 __pinned_page_end(struct kvm_pinned_page *ppage)
|
||||
{
|
||||
return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1;
|
||||
}
|
||||
|
||||
INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last,
|
||||
__pinned_page_start, __pinned_page_end, /* empty */,
|
||||
kvm_pinned_pages);
|
||||
|
||||
static int __pkvm_unmap_guest_call(u64 pfn, u64 gfn, u8 order, void *args)
|
||||
{
|
||||
struct kvm *kvm = args;
|
||||
@@ -312,7 +326,7 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
|
||||
* no update needed from here.
|
||||
*/
|
||||
unpin_user_pages(&ppage->page, 1);
|
||||
mtree_erase(&kvm->arch.pkvm.pinned_pages, ppage->ipa);
|
||||
kvm_pinned_pages_remove(ppage, &kvm->arch.pkvm.pinned_pages);
|
||||
kfree(ppage);
|
||||
|
||||
return 0;
|
||||
@@ -320,17 +334,12 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage)
|
||||
|
||||
static int pkvm_unmap_range(struct kvm *kvm, u64 start, u64 end)
|
||||
{
|
||||
struct kvm_pinned_page *ppage, *tmp;
|
||||
struct mm_struct *mm = kvm->mm;
|
||||
unsigned long index = start;
|
||||
unsigned long cnt = 0;
|
||||
void *entry;
|
||||
int ret = 0;
|
||||
|
||||
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
|
||||
struct kvm_pinned_page *ppage = entry;
|
||||
|
||||
if (ppage == KVM_DUMMY_PPAGE)
|
||||
continue;
|
||||
for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
|
||||
ret = pkvm_unmap_guest(kvm, ppage);
|
||||
if (ret)
|
||||
break;
|
||||
@@ -418,8 +427,7 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
|
||||
|
||||
static void pkvm_stage2_flush(struct kvm *kvm)
|
||||
{
|
||||
unsigned long index = 0;
|
||||
void *entry;
|
||||
struct kvm_pinned_page *ppage, *tmp;
|
||||
|
||||
/*
|
||||
* Contrary to stage2_apply_range(), we don't need to check
|
||||
@@ -427,11 +435,7 @@ static void pkvm_stage2_flush(struct kvm *kvm)
|
||||
* from a vcpu thread, and the list is only ever freed on VM
|
||||
* destroy (which only occurs when all vcpu are gone).
|
||||
*/
|
||||
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
|
||||
struct kvm_pinned_page *ppage = entry;
|
||||
|
||||
if (ppage == KVM_DUMMY_PPAGE)
|
||||
continue;
|
||||
for_ppage_node_in_range(kvm, 0, ULONG_MAX, ppage, tmp) {
|
||||
__clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE);
|
||||
cond_resched_rwlock_write(&kvm->mmu_lock);
|
||||
}
|
||||
@@ -1014,7 +1018,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
|
||||
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
|
||||
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
|
||||
kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
|
||||
mt_init_flags(&kvm->arch.pkvm.pinned_pages, MT_FLAGS_USE_RCU);
|
||||
mmu->arch = &kvm->arch;
|
||||
|
||||
if (is_protected_kvm_enabled())
|
||||
@@ -1293,18 +1296,13 @@ static int __pkvm_wrprotect_call(u64 pfn, u64 gfn, u8 order, void *args)
|
||||
|
||||
static int pkvm_wp_range(struct kvm *kvm, u64 start, u64 end)
|
||||
{
|
||||
unsigned long index = start;
|
||||
void *entry;
|
||||
struct kvm_pinned_page *ppage, *tmp;
|
||||
|
||||
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) {
|
||||
struct kvm_pinned_page *ppage = entry;
|
||||
for_ppage_node_in_range(kvm, start, end, ppage, tmp) {
|
||||
int ret;
|
||||
|
||||
if (ppage == KVM_DUMMY_PPAGE)
|
||||
continue;
|
||||
ret = pkvm_call_hyp_nvhe_ppage(ppage, __pkvm_wrprotect_call,
|
||||
kvm, false);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
@@ -1630,28 +1628,9 @@ static int pkvm_host_map_guest(u64 pfn, u64 gfn, u64 nr_pages,
|
||||
return (ret == -EPERM) ? -EAGAIN : ret;
|
||||
}
|
||||
|
||||
static struct kvm_pinned_page *
|
||||
find_ppage_or_above(struct kvm *kvm, phys_addr_t ipa)
|
||||
{
|
||||
unsigned long index = ipa;
|
||||
void *entry;
|
||||
|
||||
mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) {
|
||||
if (entry == KVM_DUMMY_PPAGE)
|
||||
continue;
|
||||
return entry;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct kvm_pinned_page *find_ppage(struct kvm *kvm, u64 ipa)
|
||||
{
|
||||
struct kvm_pinned_page *ppage;
|
||||
unsigned long index = ipa;
|
||||
|
||||
ppage = mt_find(&kvm->arch.pkvm.pinned_pages, &index, ipa + PAGE_SIZE - 1);
|
||||
return ppage == KVM_DUMMY_PPAGE ? NULL : ppage;
|
||||
return kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages, ipa, ipa + PAGE_SIZE - 1);
|
||||
}
|
||||
|
||||
static int __pkvm_relax_perms_call(u64 pfn, u64 gfn, u8 order, void *args)
|
||||
@@ -1707,11 +1686,10 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
|
||||
{
|
||||
unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
|
||||
struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
|
||||
unsigned long index, pmd_offset, page_size, end;
|
||||
unsigned long page_size = PAGE_SIZE;
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct kvm_pinned_page *ppage;
|
||||
struct kvm *kvm = vcpu->kvm;
|
||||
struct maple_tree *mt = &kvm->arch.pkvm.pinned_pages;
|
||||
int ret, nr_pages;
|
||||
struct page *page;
|
||||
u64 pfn;
|
||||
@@ -1760,66 +1738,49 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa,
|
||||
}
|
||||
|
||||
pfn = page_to_pfn(page);
|
||||
pmd_offset = *fault_ipa & (PMD_SIZE - 1);
|
||||
page_size = transparent_hugepage_adjust(kvm, memslot,
|
||||
hva, &pfn,
|
||||
fault_ipa);
|
||||
page = pfn_to_page(pfn);
|
||||
|
||||
retry:
|
||||
if (size)
|
||||
*size = page_size;
|
||||
read_lock(&kvm->mmu_lock);
|
||||
if (!kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
|
||||
ALIGN_DOWN(*fault_ipa, PMD_SIZE),
|
||||
ALIGN(*fault_ipa + 1, PMD_SIZE) - 1))
|
||||
page_size = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, fault_ipa);
|
||||
|
||||
/*
|
||||
* We take the risk of racing with another vCPU, but sync will be restored by the
|
||||
* host_map_guest HVC
|
||||
*/
|
||||
read_unlock(&kvm->mmu_lock);
|
||||
|
||||
page = pfn_to_page(pfn);
|
||||
|
||||
ret = account_locked_vm(mm, page_size >> PAGE_SHIFT, true);
|
||||
if (ret)
|
||||
goto unpin;
|
||||
|
||||
index = *fault_ipa;
|
||||
end = index + page_size - 1;
|
||||
ppage->page = page;
|
||||
ppage->ipa = *fault_ipa;
|
||||
ppage->order = get_order(page_size);
|
||||
ppage->pins = 1 << ppage->order;
|
||||
|
||||
/*
|
||||
* If we already have a mapping in the middle of the THP, we have no
|
||||
* other choice than enforcing PAGE_SIZE for pkvm_host_map_guest() to
|
||||
* succeed.
|
||||
*/
|
||||
if (page_size > PAGE_SIZE && mt_find(mt, &index, end)) {
|
||||
*fault_ipa += pmd_offset;
|
||||
pfn += pmd_offset >> PAGE_SHIFT;
|
||||
page = pfn_to_page(pfn);
|
||||
account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
|
||||
page_size = PAGE_SIZE;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
/* Reserve space in the mtree */
|
||||
ret = mtree_insert_range(mt, index, end, KVM_DUMMY_PPAGE, GFP_KERNEL);
|
||||
if (ret) {
|
||||
if (ret == -EEXIST)
|
||||
ret = 0;
|
||||
goto dec_account;
|
||||
}
|
||||
|
||||
write_lock(&kvm->mmu_lock);
|
||||
ret = pkvm_host_map_guest(pfn, *fault_ipa >> PAGE_SHIFT,
|
||||
page_size >> PAGE_SHIFT, KVM_PGTABLE_PROT_R);
|
||||
if (ret) {
|
||||
if (WARN_ON(ret == -EAGAIN))
|
||||
if (ret == -EAGAIN)
|
||||
ret = 0;
|
||||
|
||||
goto err_unlock;
|
||||
}
|
||||
WARN_ON(mtree_store_range(mt, index, end, ppage, GFP_ATOMIC));
|
||||
kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
|
||||
if (size)
|
||||
*size = page_size;
|
||||
|
||||
return 0;
|
||||
|
||||
err_unlock:
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
dec_account:
|
||||
account_locked_vm(mm, page_size >> PAGE_SHIFT, false);
|
||||
unpin:
|
||||
unpin_user_pages(&page, 1);
|
||||
@@ -1847,13 +1808,13 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
|
||||
idx = srcu_read_lock(&vcpu->kvm->srcu);
|
||||
|
||||
read_lock(&vcpu->kvm->mmu_lock);
|
||||
ppage = find_ppage_or_above(vcpu->kvm, fault_ipa);
|
||||
ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
|
||||
fault_ipa, ipa_end);
|
||||
|
||||
while (fault_ipa < ipa_end) {
|
||||
if (ppage && ppage != KVM_DUMMY_PPAGE && ppage->ipa == fault_ipa) {
|
||||
if (ppage && ppage->ipa == fault_ipa) {
|
||||
page_size = PAGE_SIZE << ppage->order;
|
||||
ppage = mt_next(&vcpu->kvm->arch.pkvm.pinned_pages,
|
||||
ppage->ipa, ULONG_MAX);
|
||||
ppage = kvm_pinned_pages_iter_next(ppage, fault_ipa, ipa_end);
|
||||
} else {
|
||||
gfn_t gfn = gpa_to_gfn(fault_ipa);
|
||||
struct kvm_memory_slot *memslot;
|
||||
@@ -1877,7 +1838,8 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si
|
||||
* We had to release the mmu_lock so let's update the
|
||||
* reference.
|
||||
*/
|
||||
ppage = find_ppage_or_above(vcpu->kvm, fault_ipa + page_size);
|
||||
ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages,
|
||||
fault_ipa + PAGE_SIZE, ipa_end);
|
||||
}
|
||||
|
||||
fault_ipa += page_size;
|
||||
@@ -1889,6 +1851,162 @@ end:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int __pkvm_pin_user_pages(struct kvm *kvm, struct kvm_memory_slot *memslot,
|
||||
u64 gfn, u64 nr_pages, struct page ***__pages)
|
||||
{
|
||||
unsigned long hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
|
||||
unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct page **pages;
|
||||
long ret;
|
||||
int p;
|
||||
|
||||
pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
|
||||
if (!pages)
|
||||
return -ENOMEM;
|
||||
|
||||
mmap_read_lock(mm);
|
||||
ret = pin_user_pages(hva, nr_pages, flags, pages);
|
||||
mmap_read_unlock(mm);
|
||||
|
||||
if (ret == -EHWPOISON) {
|
||||
kvm_send_hwpoison_signal(hva, PAGE_SHIFT);
|
||||
goto err_free_pages;
|
||||
} else if (ret == -EFAULT) {
|
||||
/* Will try MMIO map */
|
||||
ret = -EREMOTEIO;
|
||||
goto err_free_pages;
|
||||
} else if (ret < 0) {
|
||||
ret = -EFAULT;
|
||||
goto err_free_pages;
|
||||
} else if (ret != nr_pages) {
|
||||
nr_pages = ret;
|
||||
ret = -EFAULT;
|
||||
goto err_unpin_pages;
|
||||
}
|
||||
|
||||
/* See PageSwapBacked() in pkvm_mem_abort() */
|
||||
for (p = 0; p < nr_pages; p++) {
|
||||
if (!folio_test_swapbacked(page_folio(pages[p]))) {
|
||||
ret = -EIO;
|
||||
goto err_unpin_pages;
|
||||
}
|
||||
}
|
||||
|
||||
*__pages = pages;
|
||||
return 0;
|
||||
|
||||
err_unpin_pages:
|
||||
unpin_user_pages(pages, nr_pages);
|
||||
err_free_pages:
|
||||
kfree(pages);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Splitting is only expected on the back of a relinquish guest HVC in the pKVM case, while
|
||||
* pkvm_pgtable_stage2_split() can be called with dirty logging.
|
||||
*/
|
||||
int __pkvm_pgtable_stage2_split(struct kvm_vcpu *vcpu, phys_addr_t ipa, size_t size)
|
||||
{
|
||||
struct list_head ppage_prealloc = LIST_HEAD_INIT(ppage_prealloc);
|
||||
struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc;
|
||||
struct kvm_pinned_page *ppage, *tmp;
|
||||
struct kvm_memory_slot *memslot;
|
||||
struct kvm *kvm = vcpu->kvm;
|
||||
int idx, p, ret, nr_pages;
|
||||
struct page **pages;
|
||||
kvm_pfn_t pfn;
|
||||
gfn_t gfn;
|
||||
|
||||
if (!IS_ALIGNED(ipa, PMD_SIZE) || size != PMD_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
if (!hyp_memcache->nr_pages) {
|
||||
ret = topup_hyp_memcache(hyp_memcache, 1, 0);
|
||||
if (ret)
|
||||
return -ENOMEM;
|
||||
|
||||
atomic64_add(PAGE_SIZE, &kvm->stat.protected_hyp_mem);
|
||||
atomic64_add(PAGE_SIZE, &kvm->stat.protected_pgtable_mem);
|
||||
}
|
||||
|
||||
/* We already have 1 pin on the Huge Page */
|
||||
nr_pages = (size >> PAGE_SHIFT) - 1;
|
||||
gfn = (ipa >> PAGE_SHIFT) + 1;
|
||||
|
||||
/* Pre-allocate kvm_pinned_page before acquiring the mmu_lock */
|
||||
for (p = 0; p < nr_pages; p++) {
|
||||
ppage = kzalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
|
||||
if (!ppage) {
|
||||
ret = -ENOMEM;
|
||||
goto free_pinned_pages;
|
||||
}
|
||||
list_add(&ppage->list_node, &ppage_prealloc);
|
||||
}
|
||||
|
||||
idx = srcu_read_lock(&vcpu->kvm->srcu);
|
||||
memslot = gfn_to_memslot(vcpu->kvm, gfn);
|
||||
ret = __pkvm_pin_user_pages(kvm, memslot, gfn, nr_pages, &pages);
|
||||
if (ret)
|
||||
goto unlock_srcu;
|
||||
|
||||
write_lock(&kvm->mmu_lock);
|
||||
|
||||
ppage = find_ppage(kvm, ipa);
|
||||
if (!ppage) {
|
||||
ret = -EPERM;
|
||||
goto end;
|
||||
} else if (!ppage->order) {
|
||||
ret = 0;
|
||||
goto end;
|
||||
}
|
||||
|
||||
ret = kvm_call_hyp_nvhe(__pkvm_host_split_guest, page_to_pfn(ppage->page),
|
||||
ipa >> PAGE_SHIFT, size);
|
||||
if (ret)
|
||||
goto end;
|
||||
|
||||
ppage->order = 0;
|
||||
ppage->pins = 1;
|
||||
|
||||
pfn = page_to_pfn(ppage->page) + 1;
|
||||
ipa = ipa + PAGE_SIZE;
|
||||
while (nr_pages--) {
|
||||
/* Pop a ppage from the pre-allocated list */
|
||||
ppage = list_first_entry(&ppage_prealloc, struct kvm_pinned_page, list_node);
|
||||
list_del_init(&ppage->list_node);
|
||||
|
||||
ppage->page = pfn_to_page(pfn);
|
||||
ppage->ipa = ipa;
|
||||
ppage->order = 0;
|
||||
ppage->pins = 1;
|
||||
kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages);
|
||||
|
||||
pfn += 1;
|
||||
ipa += PAGE_SIZE;
|
||||
}
|
||||
|
||||
end:
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
|
||||
if (ret)
|
||||
unpin_user_pages(pages, nr_pages);
|
||||
kfree(pages);
|
||||
|
||||
unlock_srcu:
|
||||
srcu_read_unlock(&vcpu->kvm->srcu, idx);
|
||||
|
||||
free_pinned_pages:
|
||||
/* Free unused pre-allocated kvm_pinned_page */
|
||||
list_for_each_entry_safe(ppage, tmp, &ppage_prealloc, list_node) {
|
||||
list_del(&ppage->list_node);
|
||||
kfree(ppage);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
struct kvm_memory_slot *memslot, unsigned long hva,
|
||||
unsigned long fault_status)
|
||||
|
@@ -319,21 +319,17 @@ static int __reclaim_dying_guest_page_call(u64 pfn, u64 gfn, u8 order, void *arg
|
||||
|
||||
static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
|
||||
{
|
||||
struct kvm_pinned_page *tmp, *ppage;
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct kvm_pinned_page *ppage;
|
||||
struct kvm_vcpu *host_vcpu;
|
||||
unsigned long idx, ipa = 0;
|
||||
unsigned long idx;
|
||||
|
||||
if (!host_kvm->arch.pkvm.handle)
|
||||
goto out_free;
|
||||
|
||||
WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, host_kvm->arch.pkvm.handle));
|
||||
|
||||
mt_clear_in_rcu(&host_kvm->arch.pkvm.pinned_pages);
|
||||
|
||||
mt_for_each(&host_kvm->arch.pkvm.pinned_pages, ppage, ipa, ULONG_MAX) {
|
||||
if (WARN_ON(ppage == KVM_DUMMY_PPAGE))
|
||||
continue;
|
||||
for_ppage_node_in_range(host_kvm, 0, ULONG_MAX, ppage, tmp) {
|
||||
WARN_ON(pkvm_call_hyp_nvhe_ppage(ppage,
|
||||
__reclaim_dying_guest_page_call,
|
||||
host_kvm, true));
|
||||
@@ -341,9 +337,9 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
|
||||
|
||||
account_locked_vm(mm, 1, false);
|
||||
unpin_user_pages_dirty_lock(&ppage->page, 1, host_kvm->arch.pkvm.enabled);
|
||||
kvm_pinned_pages_remove(ppage, &host_kvm->arch.pkvm.pinned_pages);
|
||||
kfree(ppage);
|
||||
}
|
||||
mtree_destroy(&host_kvm->arch.pkvm.pinned_pages);
|
||||
|
||||
WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm, host_kvm->arch.pkvm.handle));
|
||||
|
||||
@@ -538,21 +534,21 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct kvm_pinned_page *ppage;
|
||||
unsigned long index = ipa;
|
||||
u16 pins;
|
||||
|
||||
write_lock(&host_kvm->mmu_lock);
|
||||
ppage = mt_find(&host_kvm->arch.pkvm.pinned_pages, &index,
|
||||
index + PAGE_SIZE - 1);
|
||||
if (ppage && ppage != KVM_DUMMY_PPAGE) {
|
||||
ppage = kvm_pinned_pages_iter_first(&host_kvm->arch.pkvm.pinned_pages,
|
||||
ipa, ipa + PAGE_SIZE - 1);
|
||||
if (ppage) {
|
||||
WARN_ON_ONCE(ppage->pins != 1);
|
||||
|
||||
if (ppage->pins)
|
||||
ppage->pins--;
|
||||
else
|
||||
WARN_ON(1);
|
||||
|
||||
pins = ppage->pins;
|
||||
if (!pins)
|
||||
mtree_erase(&host_kvm->arch.pkvm.pinned_pages, ipa);
|
||||
kvm_pinned_pages_remove(ppage,
|
||||
&host_kvm->arch.pkvm.pinned_pages);
|
||||
}
|
||||
write_unlock(&host_kvm->mmu_lock);
|
||||
|
||||
|
@@ -672,6 +672,7 @@ CONFIG_CRYPTO_ZSTD=y
|
||||
CONFIG_CRYPTO_ANSI_CPRNG=y
|
||||
CONFIG_CRYPTO_AES_NI_INTEL=y
|
||||
CONFIG_CRYPTO_POLYVAL_CLMUL_NI=y
|
||||
CONFIG_CRYPTO_SHA1_SSSE3=y
|
||||
CONFIG_CRYPTO_SHA256_SSSE3=y
|
||||
CONFIG_CRYPTO_SHA512_SSSE3=y
|
||||
CONFIG_CRC_CCITT=y
|
||||
|
@@ -14,12 +14,6 @@ CONFIG_UCLAMP_TASK=y
|
||||
CONFIG_UCLAMP_BUCKETS_COUNT=20
|
||||
CONFIG_CGROUPS=y
|
||||
CONFIG_MEMCG=y
|
||||
CONFIG_BLK_CGROUP=y
|
||||
CONFIG_CGROUP_SCHED=y
|
||||
CONFIG_UCLAMP_TASK_GROUP=y
|
||||
CONFIG_CGROUP_FREEZER=y
|
||||
CONFIG_CPUSETS=y
|
||||
CONFIG_CGROUP_CPUACCT=y
|
||||
# CONFIG_RD_BZIP2 is not set
|
||||
# CONFIG_RD_LZMA is not set
|
||||
# CONFIG_RD_XZ is not set
|
||||
@@ -47,7 +41,6 @@ CONFIG_CPU_FREQ_GOV_POWERSAVE=y
|
||||
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
|
||||
CONFIG_JUMP_LABEL=y
|
||||
# CONFIG_BLOCK_LEGACY_AUTOLOAD is not set
|
||||
CONFIG_BLK_CGROUP_IOCOST=y
|
||||
CONFIG_PARTITION_ADVANCED=y
|
||||
# CONFIG_MSDOS_PARTITION is not set
|
||||
# CONFIG_MQ_IOSCHED_DEADLINE is not set
|
||||
@@ -209,6 +202,7 @@ CONFIG_CRYPTO_HCTR2=y
|
||||
CONFIG_CRYPTO_LZO=y
|
||||
CONFIG_CRYPTO_AES_NI_INTEL=y
|
||||
CONFIG_CRYPTO_POLYVAL_CLMUL_NI=y
|
||||
CONFIG_CRYPTO_SHA1_SSSE3=y
|
||||
CONFIG_CRYPTO_SHA256_SSSE3=y
|
||||
CONFIG_CRYPTO_SHA512_SSSE3=y
|
||||
CONFIG_PRINTK_TIME=y
|
||||
|
@@ -6642,10 +6642,10 @@ static void print_binder_transaction_ilocked(struct seq_file *m,
|
||||
}
|
||||
|
||||
static void print_binder_work_ilocked(struct seq_file *m,
|
||||
struct binder_proc *proc,
|
||||
const char *prefix,
|
||||
const char *transaction_prefix,
|
||||
struct binder_work *w)
|
||||
struct binder_proc *proc,
|
||||
const char *prefix,
|
||||
const char *transaction_prefix,
|
||||
struct binder_work *w, bool hash_ptrs)
|
||||
{
|
||||
struct binder_node *node;
|
||||
struct binder_transaction *t;
|
||||
@@ -6668,9 +6668,15 @@ static void print_binder_work_ilocked(struct seq_file *m,
|
||||
break;
|
||||
case BINDER_WORK_NODE:
|
||||
node = container_of(w, struct binder_node, work);
|
||||
seq_printf(m, "%snode work %d: u%016llx c%016llx\n",
|
||||
prefix, node->debug_id,
|
||||
(u64)node->ptr, (u64)node->cookie);
|
||||
if (hash_ptrs)
|
||||
seq_printf(m, "%snode work %d: u%p c%p\n",
|
||||
prefix, node->debug_id,
|
||||
(void *)(long)node->ptr,
|
||||
(void *)(long)node->cookie);
|
||||
else
|
||||
seq_printf(m, "%snode work %d: u%016llx c%016llx\n",
|
||||
prefix, node->debug_id,
|
||||
(u64)node->ptr, (u64)node->cookie);
|
||||
break;
|
||||
case BINDER_WORK_DEAD_BINDER:
|
||||
seq_printf(m, "%shas dead binder\n", prefix);
|
||||
@@ -6695,7 +6701,7 @@ static void print_binder_work_ilocked(struct seq_file *m,
|
||||
|
||||
static void print_binder_thread_ilocked(struct seq_file *m,
|
||||
struct binder_thread *thread,
|
||||
int print_always)
|
||||
bool print_always, bool hash_ptrs)
|
||||
{
|
||||
struct binder_transaction *t;
|
||||
struct binder_work *w;
|
||||
@@ -6725,14 +6731,16 @@ static void print_binder_thread_ilocked(struct seq_file *m,
|
||||
}
|
||||
list_for_each_entry(w, &thread->todo, entry) {
|
||||
print_binder_work_ilocked(m, thread->proc, " ",
|
||||
" pending transaction", w);
|
||||
" pending transaction",
|
||||
w, hash_ptrs);
|
||||
}
|
||||
if (!print_always && m->count == header_pos)
|
||||
m->count = start_pos;
|
||||
}
|
||||
|
||||
static void print_binder_node_nilocked(struct seq_file *m,
|
||||
struct binder_node *node)
|
||||
struct binder_node *node,
|
||||
bool hash_ptrs)
|
||||
{
|
||||
struct binder_ref *ref;
|
||||
struct binder_work *w;
|
||||
@@ -6742,8 +6750,13 @@ static void print_binder_node_nilocked(struct seq_file *m,
|
||||
hlist_for_each_entry(ref, &node->refs, node_entry)
|
||||
count++;
|
||||
|
||||
seq_printf(m, " node %d: u%016llx c%016llx pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d",
|
||||
node->debug_id, (u64)node->ptr, (u64)node->cookie,
|
||||
if (hash_ptrs)
|
||||
seq_printf(m, " node %d: u%p c%p", node->debug_id,
|
||||
(void *)(long)node->ptr, (void *)(long)node->cookie);
|
||||
else
|
||||
seq_printf(m, " node %d: u%016llx c%016llx", node->debug_id,
|
||||
(u64)node->ptr, (u64)node->cookie);
|
||||
seq_printf(m, " pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d",
|
||||
node->sched_policy, node->min_priority,
|
||||
node->has_strong_ref, node->has_weak_ref,
|
||||
node->local_strong_refs, node->local_weak_refs,
|
||||
@@ -6757,7 +6770,8 @@ static void print_binder_node_nilocked(struct seq_file *m,
|
||||
if (node->proc) {
|
||||
list_for_each_entry(w, &node->async_todo, entry)
|
||||
print_binder_work_ilocked(m, node->proc, " ",
|
||||
" pending async transaction", w);
|
||||
" pending async transaction",
|
||||
w, hash_ptrs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6773,8 +6787,54 @@ static void print_binder_ref_olocked(struct seq_file *m,
|
||||
binder_node_unlock(ref->node);
|
||||
}
|
||||
|
||||
static void print_binder_proc(struct seq_file *m,
|
||||
struct binder_proc *proc, int print_all)
|
||||
/**
|
||||
* print_next_binder_node_ilocked() - Print binder_node from a locked list
|
||||
* @m: struct seq_file for output via seq_printf()
|
||||
* @proc: struct binder_proc we hold the inner_proc_lock to (if any)
|
||||
* @node: struct binder_node to print fields of
|
||||
* @prev_node: struct binder_node we hold a temporary reference to (if any)
|
||||
* @hash_ptrs: whether to hash @node's binder_uintptr_t fields
|
||||
*
|
||||
* Helper function to handle synchronization around printing a struct
|
||||
* binder_node while iterating through @proc->nodes or the dead nodes list.
|
||||
* Caller must hold either @proc->inner_lock (for live nodes) or
|
||||
* binder_dead_nodes_lock. This lock will be released during the body of this
|
||||
* function, but it will be reacquired before returning to the caller.
|
||||
*
|
||||
* Return: pointer to the struct binder_node we hold a tmpref on
|
||||
*/
|
||||
static struct binder_node *
|
||||
print_next_binder_node_ilocked(struct seq_file *m, struct binder_proc *proc,
|
||||
struct binder_node *node,
|
||||
struct binder_node *prev_node, bool hash_ptrs)
|
||||
{
|
||||
/*
|
||||
* Take a temporary reference on the node so that isn't freed while
|
||||
* we print it.
|
||||
*/
|
||||
binder_inc_node_tmpref_ilocked(node);
|
||||
/*
|
||||
* Live nodes need to drop the inner proc lock and dead nodes need to
|
||||
* drop the binder_dead_nodes_lock before trying to take the node lock.
|
||||
*/
|
||||
if (proc)
|
||||
binder_inner_proc_unlock(proc);
|
||||
else
|
||||
spin_unlock(&binder_dead_nodes_lock);
|
||||
if (prev_node)
|
||||
binder_put_node(prev_node);
|
||||
binder_node_inner_lock(node);
|
||||
print_binder_node_nilocked(m, node, hash_ptrs);
|
||||
binder_node_inner_unlock(node);
|
||||
if (proc)
|
||||
binder_inner_proc_lock(proc);
|
||||
else
|
||||
spin_lock(&binder_dead_nodes_lock);
|
||||
return node;
|
||||
}
|
||||
|
||||
static void print_binder_proc(struct seq_file *m, struct binder_proc *proc,
|
||||
bool print_all, bool hash_ptrs)
|
||||
{
|
||||
struct binder_work *w;
|
||||
struct rb_node *n;
|
||||
@@ -6787,31 +6847,19 @@ static void print_binder_proc(struct seq_file *m,
|
||||
header_pos = m->count;
|
||||
|
||||
binder_inner_proc_lock(proc);
|
||||
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
|
||||
for (n = rb_first(&proc->threads); n; n = rb_next(n))
|
||||
print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread,
|
||||
rb_node), print_all);
|
||||
rb_node), print_all, hash_ptrs);
|
||||
|
||||
for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) {
|
||||
for (n = rb_first(&proc->nodes); n; n = rb_next(n)) {
|
||||
struct binder_node *node = rb_entry(n, struct binder_node,
|
||||
rb_node);
|
||||
if (!print_all && !node->has_async_transaction)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* take a temporary reference on the node so it
|
||||
* survives and isn't removed from the tree
|
||||
* while we print it.
|
||||
*/
|
||||
binder_inc_node_tmpref_ilocked(node);
|
||||
/* Need to drop inner lock to take node lock */
|
||||
binder_inner_proc_unlock(proc);
|
||||
if (last_node)
|
||||
binder_put_node(last_node);
|
||||
binder_node_inner_lock(node);
|
||||
print_binder_node_nilocked(m, node);
|
||||
binder_node_inner_unlock(node);
|
||||
last_node = node;
|
||||
binder_inner_proc_lock(proc);
|
||||
last_node = print_next_binder_node_ilocked(m, proc, node,
|
||||
last_node,
|
||||
hash_ptrs);
|
||||
}
|
||||
binder_inner_proc_unlock(proc);
|
||||
if (last_node)
|
||||
@@ -6819,19 +6867,18 @@ static void print_binder_proc(struct seq_file *m,
|
||||
|
||||
if (print_all) {
|
||||
binder_proc_lock(proc);
|
||||
for (n = rb_first(&proc->refs_by_desc);
|
||||
n != NULL;
|
||||
n = rb_next(n))
|
||||
for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n))
|
||||
print_binder_ref_olocked(m, rb_entry(n,
|
||||
struct binder_ref,
|
||||
rb_node_desc));
|
||||
struct binder_ref,
|
||||
rb_node_desc));
|
||||
binder_proc_unlock(proc);
|
||||
}
|
||||
binder_alloc_print_allocated(m, &proc->alloc);
|
||||
binder_inner_proc_lock(proc);
|
||||
list_for_each_entry(w, &proc->todo, entry)
|
||||
print_binder_work_ilocked(m, proc, " ",
|
||||
" pending transaction", w);
|
||||
" pending transaction", w,
|
||||
hash_ptrs);
|
||||
list_for_each_entry(w, &proc->delivered_death, entry) {
|
||||
seq_puts(m, " has delivered dead binder\n");
|
||||
break;
|
||||
@@ -6958,7 +7005,7 @@ static void print_binder_proc_stats(struct seq_file *m,
|
||||
count = 0;
|
||||
ready_threads = 0;
|
||||
binder_inner_proc_lock(proc);
|
||||
for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
|
||||
for (n = rb_first(&proc->threads); n; n = rb_next(n))
|
||||
count++;
|
||||
|
||||
list_for_each_entry(thread, &proc->waiting_threads, waiting_thread_node)
|
||||
@@ -6972,7 +7019,7 @@ static void print_binder_proc_stats(struct seq_file *m,
|
||||
ready_threads,
|
||||
free_async_space);
|
||||
count = 0;
|
||||
for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n))
|
||||
for (n = rb_first(&proc->nodes); n; n = rb_next(n))
|
||||
count++;
|
||||
binder_inner_proc_unlock(proc);
|
||||
seq_printf(m, " nodes: %d\n", count);
|
||||
@@ -6980,7 +7027,7 @@ static void print_binder_proc_stats(struct seq_file *m,
|
||||
strong = 0;
|
||||
weak = 0;
|
||||
binder_proc_lock(proc);
|
||||
for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) {
|
||||
for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n)) {
|
||||
struct binder_ref *ref = rb_entry(n, struct binder_ref,
|
||||
rb_node_desc);
|
||||
count++;
|
||||
@@ -7007,7 +7054,7 @@ static void print_binder_proc_stats(struct seq_file *m,
|
||||
print_binder_stats(m, " ", &proc->stats);
|
||||
}
|
||||
|
||||
static int state_show(struct seq_file *m, void *unused)
|
||||
static void print_binder_state(struct seq_file *m, bool hash_ptrs)
|
||||
{
|
||||
struct binder_proc *proc;
|
||||
struct binder_node *node;
|
||||
@@ -7018,31 +7065,40 @@ static int state_show(struct seq_file *m, void *unused)
|
||||
spin_lock(&binder_dead_nodes_lock);
|
||||
if (!hlist_empty(&binder_dead_nodes))
|
||||
seq_puts(m, "dead nodes:\n");
|
||||
hlist_for_each_entry(node, &binder_dead_nodes, dead_node) {
|
||||
/*
|
||||
* take a temporary reference on the node so it
|
||||
* survives and isn't removed from the list
|
||||
* while we print it.
|
||||
*/
|
||||
node->tmp_refs++;
|
||||
spin_unlock(&binder_dead_nodes_lock);
|
||||
if (last_node)
|
||||
binder_put_node(last_node);
|
||||
binder_node_lock(node);
|
||||
print_binder_node_nilocked(m, node);
|
||||
binder_node_unlock(node);
|
||||
last_node = node;
|
||||
spin_lock(&binder_dead_nodes_lock);
|
||||
}
|
||||
hlist_for_each_entry(node, &binder_dead_nodes, dead_node)
|
||||
last_node = print_next_binder_node_ilocked(m, NULL, node,
|
||||
last_node,
|
||||
hash_ptrs);
|
||||
spin_unlock(&binder_dead_nodes_lock);
|
||||
if (last_node)
|
||||
binder_put_node(last_node);
|
||||
|
||||
mutex_lock(&binder_procs_lock);
|
||||
hlist_for_each_entry(proc, &binder_procs, proc_node)
|
||||
print_binder_proc(m, proc, 1);
|
||||
print_binder_proc(m, proc, true, hash_ptrs);
|
||||
mutex_unlock(&binder_procs_lock);
|
||||
}
|
||||
|
||||
static void print_binder_transactions(struct seq_file *m, bool hash_ptrs)
|
||||
{
|
||||
struct binder_proc *proc;
|
||||
|
||||
seq_puts(m, "binder transactions:\n");
|
||||
mutex_lock(&binder_procs_lock);
|
||||
hlist_for_each_entry(proc, &binder_procs, proc_node)
|
||||
print_binder_proc(m, proc, false, hash_ptrs);
|
||||
mutex_unlock(&binder_procs_lock);
|
||||
}
|
||||
|
||||
static int state_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
print_binder_state(m, false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int state_hashed_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
print_binder_state(m, true);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -7064,14 +7120,13 @@ static int stats_show(struct seq_file *m, void *unused)
|
||||
|
||||
static int transactions_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
struct binder_proc *proc;
|
||||
|
||||
seq_puts(m, "binder transactions:\n");
|
||||
mutex_lock(&binder_procs_lock);
|
||||
hlist_for_each_entry(proc, &binder_procs, proc_node)
|
||||
print_binder_proc(m, proc, 0);
|
||||
mutex_unlock(&binder_procs_lock);
|
||||
print_binder_transactions(m, false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int transactions_hashed_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
print_binder_transactions(m, true);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -7084,7 +7139,7 @@ static int proc_show(struct seq_file *m, void *unused)
|
||||
hlist_for_each_entry(itr, &binder_procs, proc_node) {
|
||||
if (itr->pid == pid) {
|
||||
seq_puts(m, "binder proc state:\n");
|
||||
print_binder_proc(m, itr, 1);
|
||||
print_binder_proc(m, itr, true, false);
|
||||
}
|
||||
}
|
||||
mutex_unlock(&binder_procs_lock);
|
||||
@@ -7151,8 +7206,10 @@ const struct file_operations binder_fops = {
|
||||
};
|
||||
|
||||
DEFINE_SHOW_ATTRIBUTE(state);
|
||||
DEFINE_SHOW_ATTRIBUTE(state_hashed);
|
||||
DEFINE_SHOW_ATTRIBUTE(stats);
|
||||
DEFINE_SHOW_ATTRIBUTE(transactions);
|
||||
DEFINE_SHOW_ATTRIBUTE(transactions_hashed);
|
||||
DEFINE_SHOW_ATTRIBUTE(transaction_log);
|
||||
|
||||
const struct binder_debugfs_entry binder_debugfs_entries[] = {
|
||||
@@ -7162,6 +7219,12 @@ const struct binder_debugfs_entry binder_debugfs_entries[] = {
|
||||
.fops = &state_fops,
|
||||
.data = NULL,
|
||||
},
|
||||
{
|
||||
.name = "state_hashed",
|
||||
.mode = 0444,
|
||||
.fops = &state_hashed_fops,
|
||||
.data = NULL,
|
||||
},
|
||||
{
|
||||
.name = "stats",
|
||||
.mode = 0444,
|
||||
@@ -7174,6 +7237,12 @@ const struct binder_debugfs_entry binder_debugfs_entries[] = {
|
||||
.fops = &transactions_fops,
|
||||
.data = NULL,
|
||||
},
|
||||
{
|
||||
.name = "transactions_hashed",
|
||||
.mode = 0444,
|
||||
.fops = &transactions_hashed_fops,
|
||||
.data = NULL,
|
||||
},
|
||||
{
|
||||
.name = "transaction_log",
|
||||
.mode = 0444,
|
||||
|
@@ -490,6 +490,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_lruvec_add_folio);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_lruvec_del_folio);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_lazyfree_bypass);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_async_mmap_readahead);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mm_free_page);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_page_mapcount);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_page_to_lrulist);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_del_page_from_lrulist);
|
||||
@@ -676,3 +677,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_filemap_fault_pre_folio_locked);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_filemap_folio_mapped);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_folio_remove_rmap_ptes);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pageset_update);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_xhci_full_reset_on_remove);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mempool_alloc_skip_wait);
|
||||
|
@@ -1002,7 +1002,7 @@ static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer)
|
||||
* If 'expires' is after the current time, we've been called
|
||||
* too early.
|
||||
*/
|
||||
if (expires > 0 && expires < ktime_get_mono_fast_ns()) {
|
||||
if (expires > 0 && expires <= ktime_get_mono_fast_ns()) {
|
||||
dev->power.timer_expires = 0;
|
||||
rpm_suspend(dev, dev->power.timer_autosuspends ?
|
||||
(RPM_ASYNC | RPM_AUTO) : RPM_ASYNC);
|
||||
|
@@ -284,15 +284,13 @@ static int kvm_arm_smmu_domain_finalize(struct kvm_arm_smmu_domain *kvm_smmu_dom
|
||||
return 0;
|
||||
}
|
||||
|
||||
kvm_smmu_domain->smmu = smmu;
|
||||
|
||||
if (kvm_smmu_domain->domain.type == IOMMU_DOMAIN_IDENTITY) {
|
||||
kvm_smmu_domain->id = KVM_IOMMU_DOMAIN_IDMAP_ID;
|
||||
/*
|
||||
* Identity domains doesn't use the DMA API, so no need to
|
||||
* set the domain aperture.
|
||||
*/
|
||||
return 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Default to stage-1. */
|
||||
@@ -325,7 +323,13 @@ static int kvm_arm_smmu_domain_finalize(struct kvm_arm_smmu_domain *kvm_smmu_dom
|
||||
|
||||
ret = kvm_call_hyp_nvhe_mc(__pkvm_host_iommu_alloc_domain,
|
||||
kvm_smmu_domain->id, kvm_smmu_domain->type);
|
||||
if (ret) {
|
||||
ida_free(&kvm_arm_smmu_domain_ida, kvm_smmu_domain->id);
|
||||
return ret;
|
||||
}
|
||||
|
||||
out:
|
||||
kvm_smmu_domain->smmu = smmu;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@@ -629,7 +629,6 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd)
|
||||
int tag = scsi_cmd_to_rq(cmd)->tag;
|
||||
struct ufshcd_lrb *lrbp = &hba->lrb[tag];
|
||||
struct ufs_hw_queue *hwq;
|
||||
unsigned long flags;
|
||||
int err;
|
||||
|
||||
/* Skip task abort in case previous aborts failed and report failure */
|
||||
@@ -668,10 +667,5 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd)
|
||||
return FAILED;
|
||||
}
|
||||
|
||||
spin_lock_irqsave(&hwq->cq_lock, flags);
|
||||
if (ufshcd_cmd_inflight(lrbp->cmd))
|
||||
ufshcd_release_scsi_cmd(hba, lrbp);
|
||||
spin_unlock_irqrestore(&hwq->cq_lock, flags);
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
@@ -6545,9 +6545,14 @@ static void ufshcd_err_handler(struct work_struct *work)
|
||||
up(&hba->host_sem);
|
||||
return;
|
||||
}
|
||||
spin_unlock_irqrestore(hba->host->host_lock, flags);
|
||||
|
||||
ufshcd_err_handling_prepare(hba);
|
||||
|
||||
spin_lock_irqsave(hba->host->host_lock, flags);
|
||||
ufshcd_set_eh_in_progress(hba);
|
||||
spin_unlock_irqrestore(hba->host->host_lock, flags);
|
||||
ufshcd_err_handling_prepare(hba);
|
||||
|
||||
/* Complete requests that have door-bell cleared by h/w */
|
||||
ufshcd_complete_requests(hba, false);
|
||||
spin_lock_irqsave(hba->host->host_lock, flags);
|
||||
|
@@ -18,6 +18,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/dmi.h>
|
||||
#include <linux/dma-mapping.h>
|
||||
#include <trace/hooks/usb.h>
|
||||
|
||||
#include "xhci.h"
|
||||
#include "xhci-trace.h"
|
||||
@@ -196,6 +197,7 @@ int xhci_reset(struct xhci_hcd *xhci, u64 timeout_us)
|
||||
u32 command;
|
||||
u32 state;
|
||||
int ret;
|
||||
bool full_reset = 0;
|
||||
|
||||
state = readl(&xhci->op_regs->status);
|
||||
|
||||
@@ -224,8 +226,11 @@ int xhci_reset(struct xhci_hcd *xhci, u64 timeout_us)
|
||||
if (xhci->quirks & XHCI_INTEL_HOST)
|
||||
udelay(1000);
|
||||
|
||||
trace_android_vh_xhci_full_reset_on_remove(&full_reset);
|
||||
|
||||
ret = xhci_handshake_check_state(xhci, &xhci->op_regs->command,
|
||||
CMD_RESET, 0, timeout_us, XHCI_STATE_REMOVING);
|
||||
CMD_RESET, 0, timeout_us,
|
||||
full_reset ? 0 : XHCI_STATE_REMOVING);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
|
@@ -547,6 +547,14 @@ struct pd_rx_event {
|
||||
struct pd_message msg;
|
||||
};
|
||||
|
||||
struct altmode_vdm_event {
|
||||
struct kthread_work work;
|
||||
struct tcpm_port *port;
|
||||
u32 header;
|
||||
u32 *data;
|
||||
int cnt;
|
||||
};
|
||||
|
||||
static const char * const pd_rev[] = {
|
||||
[PD_REV10] = "rev1",
|
||||
[PD_REV20] = "rev2",
|
||||
@@ -1531,14 +1539,66 @@ static void tcpm_queue_vdm(struct tcpm_port *port, const u32 header,
|
||||
mod_vdm_delayed_work(port, 0);
|
||||
}
|
||||
|
||||
static void tcpm_queue_vdm_unlocked(struct tcpm_port *port, const u32 header,
|
||||
const u32 *data, int cnt)
|
||||
static void tcpm_queue_vdm_work(struct kthread_work *work)
|
||||
{
|
||||
struct altmode_vdm_event *event = container_of(work,
|
||||
struct altmode_vdm_event,
|
||||
work);
|
||||
struct tcpm_port *port = event->port;
|
||||
|
||||
mutex_lock(&port->lock);
|
||||
tcpm_queue_vdm(port, header, data, cnt);
|
||||
if (port->state != SRC_READY && port->state != SNK_READY) {
|
||||
tcpm_log_force(port, "dropping altmode_vdm_event");
|
||||
goto port_unlock;
|
||||
}
|
||||
|
||||
tcpm_queue_vdm(port, event->header, event->data, event->cnt);
|
||||
|
||||
port_unlock:
|
||||
kfree(event->data);
|
||||
kfree(event);
|
||||
mutex_unlock(&port->lock);
|
||||
}
|
||||
|
||||
static int tcpm_queue_vdm_unlocked(struct tcpm_port *port, const u32 header,
|
||||
const u32 *data, int cnt)
|
||||
{
|
||||
struct altmode_vdm_event *event;
|
||||
u32 *data_cpy;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
event = kzalloc(sizeof(*event), GFP_KERNEL);
|
||||
if (!event)
|
||||
goto err_event;
|
||||
|
||||
data_cpy = kcalloc(cnt, sizeof(u32), GFP_KERNEL);
|
||||
if (!data_cpy)
|
||||
goto err_data;
|
||||
|
||||
kthread_init_work(&event->work, tcpm_queue_vdm_work);
|
||||
event->port = port;
|
||||
event->header = header;
|
||||
memcpy(data_cpy, data, sizeof(u32) * cnt);
|
||||
event->data = data_cpy;
|
||||
event->cnt = cnt;
|
||||
|
||||
ret = kthread_queue_work(port->wq, &event->work);
|
||||
if (!ret) {
|
||||
ret = -EBUSY;
|
||||
goto err_queue;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
err_queue:
|
||||
kfree(data_cpy);
|
||||
err_data:
|
||||
kfree(event);
|
||||
err_event:
|
||||
tcpm_log_force(port, "failed to queue altmode vdm, err:%d", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void svdm_consume_identity(struct tcpm_port *port, const u32 *p, int cnt)
|
||||
{
|
||||
u32 vdo = p[VDO_INDEX_IDH];
|
||||
@@ -2297,8 +2357,7 @@ static int tcpm_altmode_enter(struct typec_altmode *altmode, u32 *vdo)
|
||||
header = VDO(altmode->svid, vdo ? 2 : 1, svdm_version, CMD_ENTER_MODE);
|
||||
header |= VDO_OPOS(altmode->mode);
|
||||
|
||||
tcpm_queue_vdm_unlocked(port, header, vdo, vdo ? 1 : 0);
|
||||
return 0;
|
||||
return tcpm_queue_vdm_unlocked(port, header, vdo, vdo ? 1 : 0);
|
||||
}
|
||||
|
||||
static int tcpm_altmode_exit(struct typec_altmode *altmode)
|
||||
@@ -2314,8 +2373,7 @@ static int tcpm_altmode_exit(struct typec_altmode *altmode)
|
||||
header = VDO(altmode->svid, 1, svdm_version, CMD_EXIT_MODE);
|
||||
header |= VDO_OPOS(altmode->mode);
|
||||
|
||||
tcpm_queue_vdm_unlocked(port, header, NULL, 0);
|
||||
return 0;
|
||||
return tcpm_queue_vdm_unlocked(port, header, NULL, 0);
|
||||
}
|
||||
|
||||
static int tcpm_altmode_vdm(struct typec_altmode *altmode,
|
||||
@@ -2323,9 +2381,7 @@ static int tcpm_altmode_vdm(struct typec_altmode *altmode,
|
||||
{
|
||||
struct tcpm_port *port = typec_altmode_get_drvdata(altmode);
|
||||
|
||||
tcpm_queue_vdm_unlocked(port, header, data, count - 1);
|
||||
|
||||
return 0;
|
||||
return tcpm_queue_vdm_unlocked(port, header, data, count - 1);
|
||||
}
|
||||
|
||||
static const struct typec_altmode_ops tcpm_altmode_ops = {
|
||||
|
@@ -336,6 +336,7 @@ static struct workqueue_struct *z_erofs_workqueue __read_mostly;
|
||||
|
||||
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
|
||||
static struct kthread_worker __rcu **z_erofs_pcpu_workers;
|
||||
static atomic_t erofs_percpu_workers_initialized = ATOMIC_INIT(0);
|
||||
|
||||
static void erofs_destroy_percpu_workers(void)
|
||||
{
|
||||
@@ -381,12 +382,8 @@ static int erofs_init_percpu_workers(void)
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static inline void erofs_destroy_percpu_workers(void) {}
|
||||
static inline int erofs_init_percpu_workers(void) { return 0; }
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
|
||||
static enum cpuhp_state erofs_cpuhp_state;
|
||||
|
||||
@@ -443,15 +440,53 @@ static void erofs_cpu_hotplug_destroy(void)
|
||||
if (erofs_cpuhp_state)
|
||||
cpuhp_remove_state_nocalls(erofs_cpuhp_state);
|
||||
}
|
||||
#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
|
||||
#else /* !CONFIG_HOTPLUG_CPU */
|
||||
static inline int erofs_cpu_hotplug_init(void) { return 0; }
|
||||
static inline void erofs_cpu_hotplug_destroy(void) {}
|
||||
#endif
|
||||
#endif/* CONFIG_HOTPLUG_CPU */
|
||||
static int z_erofs_init_pcpu_workers(struct super_block *sb)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (atomic_xchg(&erofs_percpu_workers_initialized, 1))
|
||||
return 0;
|
||||
|
||||
err = erofs_init_percpu_workers();
|
||||
if (err) {
|
||||
erofs_err(sb, "per-cpu workers: failed to allocate.");
|
||||
goto err_init_percpu_workers;
|
||||
}
|
||||
|
||||
err = erofs_cpu_hotplug_init();
|
||||
if (err < 0) {
|
||||
erofs_err(sb, "per-cpu workers: failed CPU hotplug init.");
|
||||
goto err_cpuhp_init;
|
||||
}
|
||||
erofs_info(sb, "initialized per-cpu workers successfully.");
|
||||
return err;
|
||||
|
||||
err_cpuhp_init:
|
||||
erofs_destroy_percpu_workers();
|
||||
err_init_percpu_workers:
|
||||
atomic_set(&erofs_percpu_workers_initialized, 0);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void z_erofs_destroy_pcpu_workers(void)
|
||||
{
|
||||
if (!atomic_xchg(&erofs_percpu_workers_initialized, 0))
|
||||
return;
|
||||
erofs_cpu_hotplug_destroy();
|
||||
erofs_destroy_percpu_workers();
|
||||
}
|
||||
#else /* !CONFIG_EROFS_FS_PCPU_KTHREAD */
|
||||
static inline int z_erofs_init_pcpu_workers(struct super_block *sb) { return 0; }
|
||||
static inline void z_erofs_destroy_pcpu_workers(void) {}
|
||||
#endif/* CONFIG_EROFS_FS_PCPU_KTHREAD */
|
||||
|
||||
void z_erofs_exit_zip_subsystem(void)
|
||||
{
|
||||
erofs_cpu_hotplug_destroy();
|
||||
erofs_destroy_percpu_workers();
|
||||
z_erofs_destroy_pcpu_workers();
|
||||
destroy_workqueue(z_erofs_workqueue);
|
||||
z_erofs_destroy_pcluster_pool();
|
||||
}
|
||||
@@ -467,23 +502,12 @@ int __init z_erofs_init_zip_subsystem(void)
|
||||
WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
|
||||
if (!z_erofs_workqueue) {
|
||||
err = -ENOMEM;
|
||||
goto out_error_workqueue_init;
|
||||
goto out_err_workqueue_init;
|
||||
}
|
||||
|
||||
err = erofs_init_percpu_workers();
|
||||
if (err)
|
||||
goto out_error_pcpu_worker;
|
||||
|
||||
err = erofs_cpu_hotplug_init();
|
||||
if (err < 0)
|
||||
goto out_error_cpuhp_init;
|
||||
return err;
|
||||
|
||||
out_error_cpuhp_init:
|
||||
erofs_destroy_percpu_workers();
|
||||
out_error_pcpu_worker:
|
||||
destroy_workqueue(z_erofs_workqueue);
|
||||
out_error_workqueue_init:
|
||||
out_err_workqueue_init:
|
||||
z_erofs_destroy_pcluster_pool();
|
||||
out_error_pcluster_pool:
|
||||
return err;
|
||||
@@ -711,8 +735,14 @@ static const struct address_space_operations z_erofs_cache_aops = {
|
||||
|
||||
int erofs_init_managed_cache(struct super_block *sb)
|
||||
{
|
||||
struct inode *const inode = new_inode(sb);
|
||||
struct inode *inode;
|
||||
int err;
|
||||
|
||||
err = z_erofs_init_pcpu_workers(sb);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
inode = new_inode(sb);
|
||||
if (!inode)
|
||||
return -ENOMEM;
|
||||
|
||||
|
@@ -799,6 +799,10 @@ int fuse_file_read_iter_initialize(
|
||||
.size = to->count,
|
||||
};
|
||||
|
||||
fri->frio = (struct fuse_read_iter_out) {
|
||||
.ret = fri->fri.size,
|
||||
};
|
||||
|
||||
/* TODO we can't assume 'to' is a kvec */
|
||||
/* TODO we also can't assume the vector has only one component */
|
||||
*fa = (struct fuse_bpf_args) {
|
||||
@@ -833,6 +837,11 @@ int fuse_file_read_iter_backing(struct fuse_bpf_args *fa,
|
||||
if (!iov_iter_count(to))
|
||||
return 0;
|
||||
|
||||
if ((iocb->ki_flags & IOCB_DIRECT) &&
|
||||
(!ff->backing_file->f_mapping->a_ops ||
|
||||
!ff->backing_file->f_mapping->a_ops->direct_IO))
|
||||
return -EINVAL;
|
||||
|
||||
/* TODO This just plain ignores any change to fuse_read_in */
|
||||
if (is_sync_kiocb(iocb)) {
|
||||
ret = vfs_iter_read(ff->backing_file, to, &iocb->ki_pos,
|
||||
@@ -855,13 +864,14 @@ int fuse_file_read_iter_backing(struct fuse_bpf_args *fa,
|
||||
fuse_bpf_aio_cleanup_handler(aio_req);
|
||||
}
|
||||
|
||||
frio->ret = ret;
|
||||
|
||||
/* TODO Need to point value at the buffer for post-modification */
|
||||
|
||||
out:
|
||||
fuse_file_accessed(file, ff->backing_file);
|
||||
|
||||
frio->ret = ret;
|
||||
return ret < 0 ? ret : 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void *fuse_file_read_iter_finalize(struct fuse_bpf_args *fa,
|
||||
|
@@ -41,6 +41,24 @@ struct poll_table_struct;
|
||||
|
||||
/* define the enumeration of all cgroup subsystems */
|
||||
#define SUBSYS(_x) _x ## _cgrp_id,
|
||||
|
||||
#define CSS_COUNTERS_SIZE (CGROUP_SUBSYS_COUNT * sizeof(atomic_t))
|
||||
|
||||
/*
|
||||
* This should just use max(), but max() doesn't work in struct definitions.
|
||||
*
|
||||
* Originally, the space was reserved for per cgroup subsystem counters, where each counter was
|
||||
* the size of an atomic_t variable. However, it was later reused to fit a struct rcu_head
|
||||
* which is why the calculation considers the size of struct rcu_head.
|
||||
*
|
||||
* This macro is provided to ANDROID_BACKPORT_USE_ARRAY() which needs to reserve at least
|
||||
* enough memory to accommodate struct rcu_head. However, if we only reserve CSS_COUNTERS_SIZE,
|
||||
* that may not be enough space on kernels with a small amount of cgroup subsystems enabled. So,
|
||||
* we take the max between the two values to use in ANDROID_BACKPORT_USE_ARRAY().
|
||||
*/
|
||||
#define CGROUP_ROOT_BACKPORT_PADDING_SIZE \
|
||||
(CSS_COUNTERS_SIZE > sizeof(struct rcu_head) ? CSS_COUNTERS_SIZE : sizeof(struct rcu_head))
|
||||
|
||||
enum cgroup_subsys_id {
|
||||
#include <linux/cgroup_subsys.h>
|
||||
CGROUP_SUBSYS_COUNT,
|
||||
@@ -585,8 +603,12 @@ struct cgroup_root {
|
||||
/* The name for this hierarchy - may be empty */
|
||||
char name[MAX_CGROUP_ROOT_NAMELEN];
|
||||
|
||||
ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_SUBSYS_COUNT * sizeof(atomic_t),
|
||||
struct rcu_head rcu);
|
||||
/* Use the original calculation to preserve the CRC value for the ABI. */
|
||||
#ifndef __GENKSYMS__
|
||||
ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_ROOT_BACKPORT_PADDING_SIZE, struct rcu_head rcu);
|
||||
#else
|
||||
ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_SUBSYS_COUNT * sizeof(atomic_t), struct rcu_head rcu);
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
|
@@ -277,15 +277,25 @@ struct mthp_stat {
|
||||
#ifdef CONFIG_SYSFS
|
||||
DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
|
||||
|
||||
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
|
||||
static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
|
||||
{
|
||||
if (order <= 0 || order > PMD_ORDER)
|
||||
return;
|
||||
|
||||
this_cpu_inc(mthp_stats.stats[order][item]);
|
||||
this_cpu_add(mthp_stats.stats[order][item], delta);
|
||||
}
|
||||
|
||||
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
|
||||
{
|
||||
mod_mthp_stat(order, item, 1);
|
||||
}
|
||||
|
||||
unsigned long sum_mthp_stat(int order, enum mthp_stat_item item);
|
||||
#else
|
||||
static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
|
||||
{
|
||||
}
|
||||
@@ -326,7 +336,7 @@ static inline int split_huge_page(struct page *page)
|
||||
{
|
||||
return split_huge_page_to_list(page, NULL);
|
||||
}
|
||||
void deferred_split_folio(struct folio *folio);
|
||||
void deferred_split_folio(struct folio *folio, bool partially_mapped);
|
||||
|
||||
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
unsigned long address, bool freeze, struct folio *folio);
|
||||
@@ -486,7 +496,7 @@ static inline int split_huge_page(struct page *page)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline void deferred_split_folio(struct folio *folio) {}
|
||||
static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
|
||||
#define split_huge_pmd(__vma, __pmd, __address) \
|
||||
do { } while (0)
|
||||
|
||||
|
@@ -4,6 +4,7 @@
|
||||
|
||||
#include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
|
||||
|
||||
extern unsigned int khugepaged_max_ptes_none __read_mostly;
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
extern struct attribute_group khugepaged_attr_group;
|
||||
|
||||
|
@@ -731,8 +731,15 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
|
||||
__mem_cgroup_uncharge_list(page_list);
|
||||
}
|
||||
|
||||
void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
|
||||
void __mem_cgroup_uncharge_folios(struct folio_batch *folios);
|
||||
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
|
||||
{
|
||||
if (mem_cgroup_disabled())
|
||||
return;
|
||||
__mem_cgroup_uncharge_folios(folios);
|
||||
}
|
||||
|
||||
void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
|
||||
void mem_cgroup_migrate(struct folio *old, struct folio *new);
|
||||
|
||||
/**
|
||||
@@ -1171,6 +1178,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
|
||||
gfp_t gfp_mask,
|
||||
unsigned long *total_scanned);
|
||||
|
||||
extern int mem_cgroup_init(void);
|
||||
#else /* CONFIG_MEMCG */
|
||||
|
||||
#define MEM_CGROUP_ID_SHIFT 0
|
||||
@@ -1297,6 +1305,10 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void mem_cgroup_replace_folio(struct folio *old,
|
||||
struct folio *new)
|
||||
{
|
||||
@@ -1619,6 +1631,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int mem_cgroup_init(void) { return 0; }
|
||||
#endif /* CONFIG_MEMCG */
|
||||
|
||||
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
|
||||
@@ -1682,18 +1696,18 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
|
||||
return folio_lruvec_lock_irq(folio);
|
||||
}
|
||||
|
||||
/* Don't lock again iff page's lruvec locked */
|
||||
static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio,
|
||||
struct lruvec *locked_lruvec, unsigned long *flags)
|
||||
/* Don't lock again iff folio's lruvec locked */
|
||||
static inline void folio_lruvec_relock_irqsave(struct folio *folio,
|
||||
struct lruvec **lruvecp, unsigned long *flags)
|
||||
{
|
||||
if (locked_lruvec) {
|
||||
if (folio_matches_lruvec(folio, locked_lruvec))
|
||||
return locked_lruvec;
|
||||
if (*lruvecp) {
|
||||
if (folio_matches_lruvec(folio, *lruvecp))
|
||||
return;
|
||||
|
||||
unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
|
||||
unlock_page_lruvec_irqrestore(*lruvecp, *flags);
|
||||
}
|
||||
|
||||
return folio_lruvec_lock_irqsave(folio, flags);
|
||||
*lruvecp = folio_lruvec_lock_irqsave(folio, flags);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
|
@@ -39,6 +39,7 @@ struct anon_vma;
|
||||
struct anon_vma_chain;
|
||||
struct user_struct;
|
||||
struct pt_regs;
|
||||
struct folio_batch;
|
||||
|
||||
extern int sysctl_page_lock_unfairness;
|
||||
|
||||
@@ -1539,6 +1540,8 @@ static inline void folio_put_refs(struct folio *folio, int refs)
|
||||
__folio_put(folio);
|
||||
}
|
||||
|
||||
void folios_put_refs(struct folio_batch *folios, unsigned int *refs);
|
||||
|
||||
/*
|
||||
* union release_pages_arg - an array of pages or folios
|
||||
*
|
||||
@@ -1561,18 +1564,19 @@ void release_pages(release_pages_arg, int nr);
|
||||
/**
|
||||
* folios_put - Decrement the reference count on an array of folios.
|
||||
* @folios: The folios.
|
||||
* @nr: How many folios there are.
|
||||
*
|
||||
* Like folio_put(), but for an array of folios. This is more efficient
|
||||
* than writing the loop yourself as it will optimise the locks which
|
||||
* need to be taken if the folios are freed.
|
||||
* Like folio_put(), but for a batch of folios. This is more efficient
|
||||
* than writing the loop yourself as it will optimise the locks which need
|
||||
* to be taken if the folios are freed. The folios batch is returned
|
||||
* empty and ready to be reused for another batch; there is no need to
|
||||
* reinitialise it.
|
||||
*
|
||||
* Context: May be called in process or interrupt context, but not in NMI
|
||||
* context. May be called while holding a spinlock.
|
||||
*/
|
||||
static inline void folios_put(struct folio **folios, unsigned int nr)
|
||||
static inline void folios_put(struct folio_batch *folios)
|
||||
{
|
||||
release_pages(folios, nr);
|
||||
folios_put_refs(folios, NULL);
|
||||
}
|
||||
|
||||
static inline void put_page(struct page *page)
|
||||
|
@@ -37,6 +37,22 @@
|
||||
|
||||
#define NR_PAGE_ORDERS (MAX_ORDER + 1)
|
||||
|
||||
/* Defines the order for the number of pages that have a migrate type. */
|
||||
#ifndef CONFIG_PAGE_BLOCK_ORDER
|
||||
#define PAGE_BLOCK_ORDER MAX_ORDER
|
||||
#else
|
||||
#define PAGE_BLOCK_ORDER CONFIG_PAGE_BLOCK_ORDER
|
||||
#endif /* CONFIG_PAGE_BLOCK_ORDER */
|
||||
|
||||
/*
|
||||
* The MAX_ORDER, which defines the max order of pages to be allocated
|
||||
* by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_ORDER,
|
||||
* which defines the order for the number of pages that can have a migrate type
|
||||
*/
|
||||
#if (PAGE_BLOCK_ORDER > MAX_ORDER)
|
||||
#error MAX_ORDER must be >= PAGE_BLOCK_ORDER
|
||||
#endif
|
||||
|
||||
/*
|
||||
* PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
|
||||
* costly to service. That is between allocation orders which should
|
||||
|
@@ -197,6 +197,7 @@ enum pageflags {
|
||||
/* At least one page in this folio has the hwpoison flag set */
|
||||
PG_has_hwpoisoned = PG_error,
|
||||
PG_large_rmappable = PG_workingset, /* anon or file-backed */
|
||||
PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
|
||||
};
|
||||
|
||||
#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
|
||||
@@ -372,54 +373,77 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n)
|
||||
#define FOLIO_PF_NO_COMPOUND 0
|
||||
#define FOLIO_PF_SECOND 1
|
||||
|
||||
#define FOLIO_HEAD_PAGE 0
|
||||
#define FOLIO_SECOND_PAGE 1
|
||||
|
||||
/*
|
||||
* Macros to create function definitions for page flags
|
||||
*/
|
||||
#define FOLIO_TEST_FLAG(name, page) \
|
||||
static __always_inline bool folio_test_##name(struct folio *folio) \
|
||||
{ return test_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define FOLIO_SET_FLAG(name, page) \
|
||||
static __always_inline void folio_set_##name(struct folio *folio) \
|
||||
{ set_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define FOLIO_CLEAR_FLAG(name, page) \
|
||||
static __always_inline void folio_clear_##name(struct folio *folio) \
|
||||
{ clear_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define __FOLIO_SET_FLAG(name, page) \
|
||||
static __always_inline void __folio_set_##name(struct folio *folio) \
|
||||
{ __set_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define __FOLIO_CLEAR_FLAG(name, page) \
|
||||
static __always_inline void __folio_clear_##name(struct folio *folio) \
|
||||
{ __clear_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define FOLIO_TEST_SET_FLAG(name, page) \
|
||||
static __always_inline bool folio_test_set_##name(struct folio *folio) \
|
||||
{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define FOLIO_TEST_CLEAR_FLAG(name, page) \
|
||||
static __always_inline bool folio_test_clear_##name(struct folio *folio) \
|
||||
{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); }
|
||||
|
||||
#define FOLIO_FLAG(name, page) \
|
||||
FOLIO_TEST_FLAG(name, page) \
|
||||
FOLIO_SET_FLAG(name, page) \
|
||||
FOLIO_CLEAR_FLAG(name, page)
|
||||
|
||||
#define TESTPAGEFLAG(uname, lname, policy) \
|
||||
static __always_inline bool folio_test_##lname(struct folio *folio) \
|
||||
{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
FOLIO_TEST_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline int Page##uname(struct page *page) \
|
||||
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
|
||||
|
||||
#define SETPAGEFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
void folio_set_##lname(struct folio *folio) \
|
||||
{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
FOLIO_SET_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline void SetPage##uname(struct page *page) \
|
||||
{ set_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
#define CLEARPAGEFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
void folio_clear_##lname(struct folio *folio) \
|
||||
{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline void ClearPage##uname(struct page *page) \
|
||||
{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
#define __SETPAGEFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
void __folio_set_##lname(struct folio *folio) \
|
||||
{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
__FOLIO_SET_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline void __SetPage##uname(struct page *page) \
|
||||
{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
#define __CLEARPAGEFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
void __folio_clear_##lname(struct folio *folio) \
|
||||
{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline void __ClearPage##uname(struct page *page) \
|
||||
{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
#define TESTSETFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
bool folio_test_set_##lname(struct folio *folio) \
|
||||
{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline int TestSetPage##uname(struct page *page) \
|
||||
{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
#define TESTCLEARFLAG(uname, lname, policy) \
|
||||
static __always_inline \
|
||||
bool folio_test_clear_##lname(struct folio *folio) \
|
||||
{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
|
||||
FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \
|
||||
static __always_inline int TestClearPage##uname(struct page *page) \
|
||||
{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
|
||||
|
||||
@@ -842,8 +866,18 @@ static inline void ClearPageCompound(struct page *page)
|
||||
ClearPageHead(page);
|
||||
}
|
||||
PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND)
|
||||
FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
|
||||
/*
|
||||
* PG_partially_mapped is protected by deferred_split split_queue_lock,
|
||||
* so its safe to use non-atomic set/clear.
|
||||
*/
|
||||
__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
|
||||
__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
|
||||
#else
|
||||
TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable)
|
||||
FOLIO_TEST_FLAG_FALSE(partially_mapped)
|
||||
__FOLIO_SET_FLAG_NOOP(partially_mapped)
|
||||
__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
|
||||
#endif
|
||||
|
||||
#define PG_head_mask ((1UL << PG_head))
|
||||
@@ -1111,7 +1145,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
|
||||
*/
|
||||
#define PAGE_FLAGS_SECOND \
|
||||
(0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \
|
||||
1UL << PG_large_rmappable)
|
||||
1UL << PG_large_rmappable | 1UL << PG_partially_mapped)
|
||||
|
||||
#define PAGE_FLAGS_PRIVATE \
|
||||
(1UL << PG_private | 1UL << PG_private_2)
|
||||
|
@@ -3,10 +3,6 @@
|
||||
#define __LINUX_PAGEISOLATION_H
|
||||
|
||||
#ifdef CONFIG_MEMORY_ISOLATION
|
||||
static inline bool has_isolate_pageblock(struct zone *zone)
|
||||
{
|
||||
return zone->nr_isolate_pageblock;
|
||||
}
|
||||
static inline bool is_migrate_isolate_page(struct page *page)
|
||||
{
|
||||
return get_pageblock_migratetype(page) == MIGRATE_ISOLATE;
|
||||
@@ -16,10 +12,6 @@ static inline bool is_migrate_isolate(int migratetype)
|
||||
return migratetype == MIGRATE_ISOLATE;
|
||||
}
|
||||
#else
|
||||
static inline bool has_isolate_pageblock(struct zone *zone)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
static inline bool is_migrate_isolate_page(struct page *page)
|
||||
{
|
||||
return false;
|
||||
|
@@ -28,7 +28,7 @@ enum pageblock_bits {
|
||||
NR_PAGEBLOCK_BITS
|
||||
};
|
||||
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
#if defined(CONFIG_HUGETLB_PAGE)
|
||||
|
||||
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
|
||||
|
||||
@@ -41,14 +41,18 @@ extern unsigned int pageblock_order;
|
||||
* Huge pages are a constant size, but don't exceed the maximum allocation
|
||||
* granularity.
|
||||
*/
|
||||
#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER)
|
||||
#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, PAGE_BLOCK_ORDER)
|
||||
|
||||
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
|
||||
|
||||
#else /* CONFIG_HUGETLB_PAGE */
|
||||
#elif defined(CONFIG_TRANSPARENT_HUGEPAGE)
|
||||
|
||||
#define pageblock_order min_t(unsigned int, HPAGE_PMD_ORDER, PAGE_BLOCK_ORDER)
|
||||
|
||||
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
|
||||
#define pageblock_order MAX_ORDER
|
||||
#define pageblock_order PAGE_BLOCK_ORDER
|
||||
|
||||
#endif /* CONFIG_HUGETLB_PAGE */
|
||||
|
||||
|
@@ -742,7 +742,12 @@ int folio_mkclean(struct folio *);
|
||||
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
|
||||
struct vm_area_struct *vma);
|
||||
|
||||
void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
|
||||
enum rmp_flags {
|
||||
RMP_LOCKED = 1 << 0,
|
||||
RMP_USE_SHARED_ZEROPAGE = 1 << 1,
|
||||
};
|
||||
|
||||
void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);
|
||||
|
||||
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
|
||||
|
||||
|
@@ -52,6 +52,8 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip,
|
||||
int trace_array_init_printk(struct trace_array *tr);
|
||||
void trace_array_put(struct trace_array *tr);
|
||||
struct trace_array *trace_array_get_by_name(const char *name);
|
||||
struct trace_array *trace_array_get_by_name_ext(const char *name,
|
||||
const char *systems);
|
||||
int trace_array_destroy(struct trace_array *tr);
|
||||
|
||||
/* For osnoise tracer */
|
||||
@@ -88,6 +90,11 @@ static inline struct trace_array *trace_array_get_by_name(const char *name)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
static inline struct trace_array *trace_array_get_by_name_ext(
|
||||
const char *name, const char *systems)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
static inline int trace_array_destroy(struct trace_array *tr)
|
||||
{
|
||||
return 0;
|
||||
|
@@ -8,21 +8,46 @@
|
||||
#include <linux/refcount.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
void unix_inflight(struct user_struct *user, struct file *fp);
|
||||
void unix_notinflight(struct user_struct *user, struct file *fp);
|
||||
void unix_destruct_scm(struct sk_buff *skb);
|
||||
void io_uring_destruct_scm(struct sk_buff *skb);
|
||||
void unix_gc(void);
|
||||
void wait_for_unix_gc(void);
|
||||
#if IS_ENABLED(CONFIG_UNIX)
|
||||
struct unix_sock *unix_get_socket(struct file *filp);
|
||||
#else
|
||||
static inline struct unix_sock *unix_get_socket(struct file *filp)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
extern unsigned int unix_tot_inflight;
|
||||
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
|
||||
void unix_del_edges(struct scm_fp_list *fpl);
|
||||
void unix_update_edges(struct unix_sock *receiver);
|
||||
int unix_prepare_fpl(struct scm_fp_list *fpl);
|
||||
void unix_destroy_fpl(struct scm_fp_list *fpl);
|
||||
void unix_gc(void);
|
||||
void wait_for_unix_gc(struct scm_fp_list *fpl);
|
||||
|
||||
struct unix_vertex {
|
||||
struct list_head edges;
|
||||
struct list_head entry;
|
||||
struct list_head scc_entry;
|
||||
unsigned long out_degree;
|
||||
unsigned long index;
|
||||
unsigned long scc_index;
|
||||
};
|
||||
|
||||
struct unix_edge {
|
||||
struct unix_sock *predecessor;
|
||||
struct unix_sock *successor;
|
||||
struct list_head vertex_entry;
|
||||
struct list_head stack_entry;
|
||||
};
|
||||
|
||||
struct sock *unix_peer_get(struct sock *sk);
|
||||
|
||||
#define UNIX_HASH_MOD (256 - 1)
|
||||
#define UNIX_HASH_SIZE (256 * 2)
|
||||
#define UNIX_HASH_BITS 8
|
||||
|
||||
extern unsigned int unix_tot_inflight;
|
||||
|
||||
struct unix_address {
|
||||
refcount_t refcnt;
|
||||
int len;
|
||||
@@ -42,6 +67,7 @@ struct unix_skb_parms {
|
||||
|
||||
struct scm_stat {
|
||||
atomic_t nr_fds;
|
||||
unsigned long nr_unix_fds;
|
||||
};
|
||||
|
||||
#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb))
|
||||
@@ -54,12 +80,9 @@ struct unix_sock {
|
||||
struct path path;
|
||||
struct mutex iolock, bindlock;
|
||||
struct sock *peer;
|
||||
struct list_head link;
|
||||
unsigned long inflight;
|
||||
struct unix_vertex *vertex;
|
||||
struct sock *listener;
|
||||
spinlock_t lock;
|
||||
unsigned long gc_flags;
|
||||
#define UNIX_GC_CANDIDATE 0
|
||||
#define UNIX_GC_MAYBE_CYCLE 1
|
||||
struct socket_wq peer_wq;
|
||||
wait_queue_entry_t peer_wake;
|
||||
struct scm_stat scm_stat;
|
||||
|
@@ -22,11 +22,24 @@ struct scm_creds {
|
||||
kgid_t gid;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_UNIX
|
||||
struct unix_edge;
|
||||
#endif
|
||||
|
||||
struct scm_fp_list {
|
||||
short count;
|
||||
short max;
|
||||
struct user_struct *user;
|
||||
struct file *fp[SCM_MAX_FD];
|
||||
#ifndef __GENKSYMS__
|
||||
#ifdef CONFIG_UNIX
|
||||
bool inflight;
|
||||
bool dead;
|
||||
struct list_head vertices;
|
||||
struct unix_edge *edges;
|
||||
#endif
|
||||
short count_unix;
|
||||
#endif
|
||||
};
|
||||
|
||||
struct scm_cookie {
|
||||
|
@@ -431,6 +431,9 @@ DECLARE_HOOK(android_vh_add_lazyfree_bypass,
|
||||
DECLARE_HOOK(android_vh_do_async_mmap_readahead,
|
||||
TP_PROTO(struct vm_fault *vmf, struct folio *folio, bool *skip),
|
||||
TP_ARGS(vmf, folio, skip));
|
||||
DECLARE_HOOK(android_vh_mm_free_page,
|
||||
TP_PROTO(struct page *page),
|
||||
TP_ARGS(page));
|
||||
|
||||
DECLARE_HOOK(android_vh_cma_debug_show_areas,
|
||||
TP_PROTO(bool *show),
|
||||
@@ -596,6 +599,9 @@ DECLARE_HOOK(android_vh_folio_remove_rmap_ptes,
|
||||
DECLARE_HOOK(android_vh_pageset_update,
|
||||
TP_PROTO(unsigned long *high, unsigned long *batch),
|
||||
TP_ARGS(high, batch));
|
||||
DECLARE_HOOK(android_vh_mempool_alloc_skip_wait,
|
||||
TP_PROTO(gfp_t *gfp_flags, bool *skip_wait),
|
||||
TP_ARGS(gfp_flags, skip_wait));
|
||||
#endif /* _TRACE_HOOK_MM_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
@@ -31,6 +31,10 @@ DECLARE_HOOK(android_vh_usb_new_device_added,
|
||||
TP_PROTO(struct usb_device *udev, int *err),
|
||||
TP_ARGS(udev, err));
|
||||
|
||||
DECLARE_HOOK(android_vh_xhci_full_reset_on_remove,
|
||||
TP_PROTO(bool *full_reset),
|
||||
TP_ARGS(full_reset));
|
||||
|
||||
#endif /* _TRACE_HOOK_USB_H */
|
||||
/* This part must be outside protection */
|
||||
#include <trace/define_trace.h>
|
||||
|
@@ -50,6 +50,7 @@
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/efi.h>
|
||||
#include <linux/tick.h>
|
||||
@@ -1062,6 +1063,7 @@ void start_kernel(void)
|
||||
proc_root_init();
|
||||
nsfs_init();
|
||||
cpuset_init();
|
||||
mem_cgroup_init();
|
||||
cgroup_init();
|
||||
taskstats_init_early();
|
||||
delayacct_init();
|
||||
|
@@ -452,7 +452,7 @@ struct kmem_cache *files_cachep;
|
||||
struct kmem_cache *fs_cachep;
|
||||
|
||||
/* SLAB cache for vm_area_struct structures */
|
||||
static struct kmem_cache *vm_area_cachep;
|
||||
struct kmem_cache *vm_area_cachep;
|
||||
|
||||
/* SLAB cache for mm_struct structures (tsk->mm) */
|
||||
static struct kmem_cache *mm_cachep;
|
||||
|
@@ -227,6 +227,14 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
|
||||
|
||||
void irq_startup_managed(struct irq_desc *desc)
|
||||
{
|
||||
struct irq_data *d = irq_desc_get_irq_data(desc);
|
||||
|
||||
/*
|
||||
* Clear managed-shutdown flag, so we don't repeat managed-startup for
|
||||
* multiple hotplugs, and cause imbalanced disable depth.
|
||||
*/
|
||||
irqd_clr_managed_shutdown(d);
|
||||
|
||||
/*
|
||||
* Only start it up when the disable depth is 1, so that a disable,
|
||||
* hotunplug, hotplug sequence does not end up enabling it during
|
||||
|
@@ -211,13 +211,6 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
|
||||
!irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Don't restore suspended interrupts here when a system comes back
|
||||
* from S3. They are reenabled via resume_device_irqs().
|
||||
*/
|
||||
if (desc->istate & IRQS_SUSPENDED)
|
||||
return;
|
||||
|
||||
if (irqd_is_managed_and_shutdown(data))
|
||||
irq_startup_managed(desc);
|
||||
|
||||
|
@@ -9538,16 +9538,19 @@ static int trace_array_create_dir(struct trace_array *tr)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct trace_array *trace_array_create(const char *name)
|
||||
static struct trace_array *
|
||||
trace_array_create_systems(const char *name, const char *systems)
|
||||
{
|
||||
struct trace_array_ext *tr_ext;
|
||||
struct trace_array *tr;
|
||||
int ret;
|
||||
|
||||
ret = -ENOMEM;
|
||||
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
|
||||
if (!tr)
|
||||
tr_ext = kzalloc(sizeof(*tr_ext), GFP_KERNEL);
|
||||
if (!tr_ext)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
tr = &tr_ext->trace_array;
|
||||
tr->name = kstrdup(name, GFP_KERNEL);
|
||||
if (!tr->name)
|
||||
goto out_free_tr;
|
||||
@@ -9558,6 +9561,12 @@ static struct trace_array *trace_array_create(const char *name)
|
||||
if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
|
||||
goto out_free_tr;
|
||||
|
||||
if (systems) {
|
||||
tr_ext->system_names = kstrdup_const(systems, GFP_KERNEL);
|
||||
if (!tr_ext->system_names)
|
||||
goto out_free_tr;
|
||||
}
|
||||
|
||||
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
|
||||
|
||||
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
|
||||
@@ -9601,12 +9610,18 @@ static struct trace_array *trace_array_create(const char *name)
|
||||
free_trace_buffers(tr);
|
||||
free_cpumask_var(tr->pipe_cpumask);
|
||||
free_cpumask_var(tr->tracing_cpumask);
|
||||
kfree_const(tr_ext->system_names);
|
||||
kfree(tr->name);
|
||||
kfree(tr);
|
||||
kfree(tr_ext);
|
||||
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static struct trace_array *trace_array_create(const char *name)
|
||||
{
|
||||
return trace_array_create_systems(name, NULL);
|
||||
}
|
||||
|
||||
static int instance_mkdir(const char *name)
|
||||
{
|
||||
struct trace_array *tr;
|
||||
@@ -9629,9 +9644,27 @@ out_unlock:
|
||||
return ret;
|
||||
}
|
||||
|
||||
const char *trace_array_get_system_names(struct trace_array *tr)
|
||||
{
|
||||
struct trace_array_ext *tr_ext;
|
||||
|
||||
if (tr == &global_trace)
|
||||
return NULL;
|
||||
|
||||
tr_ext = container_of(tr, struct trace_array_ext, trace_array);
|
||||
return tr_ext->system_names;
|
||||
}
|
||||
|
||||
struct trace_array *trace_array_get_by_name(const char *name)
|
||||
{
|
||||
return trace_array_get_by_name_ext(name, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(trace_array_get_by_name);
|
||||
|
||||
/**
|
||||
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
|
||||
* trace_array_get_by_name_ext - Create/Lookup a trace array, given its name.
|
||||
* @name: The name of the trace array to be looked up/created.
|
||||
* @systems: A list of systems to create event directories for (NULL for all)
|
||||
*
|
||||
* Returns pointer to trace array with given name.
|
||||
* NULL, if it cannot be created.
|
||||
@@ -9645,7 +9678,8 @@ out_unlock:
|
||||
* trace_array_put() is called, user space can not delete it.
|
||||
*
|
||||
*/
|
||||
struct trace_array *trace_array_get_by_name(const char *name)
|
||||
struct trace_array *trace_array_get_by_name_ext(const char *name,
|
||||
const char *systems)
|
||||
{
|
||||
struct trace_array *tr;
|
||||
|
||||
@@ -9657,7 +9691,7 @@ struct trace_array *trace_array_get_by_name(const char *name)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
tr = trace_array_create(name);
|
||||
tr = trace_array_create_systems(name, systems);
|
||||
|
||||
if (IS_ERR(tr))
|
||||
tr = NULL;
|
||||
@@ -9669,11 +9703,14 @@ out_unlock:
|
||||
mutex_unlock(&event_mutex);
|
||||
return tr;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(trace_array_get_by_name);
|
||||
EXPORT_SYMBOL_GPL(trace_array_get_by_name_ext);
|
||||
|
||||
static int __remove_instance(struct trace_array *tr)
|
||||
{
|
||||
int i;
|
||||
struct trace_array_ext *tr_ext = container_of(tr,
|
||||
struct trace_array_ext,
|
||||
trace_array);
|
||||
|
||||
/* Reference counter for a newly created trace array = 1. */
|
||||
if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
|
||||
@@ -9704,8 +9741,9 @@ static int __remove_instance(struct trace_array *tr)
|
||||
|
||||
free_cpumask_var(tr->pipe_cpumask);
|
||||
free_cpumask_var(tr->tracing_cpumask);
|
||||
kfree_const(tr_ext->system_names);
|
||||
kfree(tr->name);
|
||||
kfree(tr);
|
||||
kfree(tr_ext);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@@ -412,6 +412,11 @@ struct trace_array {
struct trace_func_repeats __percpu *last_func_repeats;
};

struct trace_array_ext {
const char *system_names;
struct trace_array trace_array;
};
|
||||
|
||||
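struct trace_array_ext extends trace_array without changing its layout: the new field lives in a wrapper and the embedded trace_array is handed out as before, which keeps the layout of struct trace_array intact. Code holding a trace_array pointer recovers the wrapper with container_of(), as trace_array_get_system_names() and __remove_instance() do above. A small helper capturing the pattern (hypothetical, not part of the patch):

#include <linux/container_of.h>

static inline struct trace_array_ext *to_trace_array_ext(struct trace_array *tr)
{
    /*
     * Only valid for instances allocated as trace_array_ext;
     * global_trace is not wrapped, hence the explicit check in
     * trace_array_get_system_names().
     */
    return container_of(tr, struct trace_array_ext, trace_array);
}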
enum {
|
||||
TRACE_ARRAY_FL_GLOBAL = (1 << 0)
|
||||
};
|
||||
@@ -420,6 +425,7 @@ extern struct list_head ftrace_trace_arrays;
|
||||
|
||||
extern struct mutex trace_types_lock;
|
||||
|
||||
extern const char *trace_array_get_system_names(struct trace_array *tr);
|
||||
extern int trace_array_get(struct trace_array *tr);
|
||||
extern int tracing_check_open_get_tr(struct trace_array *tr);
|
||||
extern struct trace_array *trace_array_find(const char *instance);
|
||||
|
@@ -3041,6 +3041,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
|
||||
up_write(&trace_event_sem);
|
||||
}
|
||||
|
||||
static bool event_in_systems(struct trace_event_call *call,
const char *systems)
{
const char *system;
const char *p;

if (!systems)
return true;

system = call->class->system;
p = strstr(systems, system);
if (!p)
return false;

if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
return false;

p += strlen(system);
return !*p || isspace(*p) || *p == ',';
}
|
||||
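The rule implemented by event_in_systems() is that the event's system name must appear in the systems list as a whole token, delimited by the start or end of the string, whitespace, or a comma. The userspace re-implementation below (illustrative only, not kernel code) makes the rule easy to experiment with:

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool system_listed(const char *system, const char *systems)
{
    const char *p;

    if (!systems)
        return true;    /* NULL means "all systems" */

    p = strstr(systems, system);
    if (!p)
        return false;
    /* the token must start the list or follow whitespace / ',' */
    if (p != systems && !isspace((unsigned char)p[-1]) && p[-1] != ',')
        return false;
    p += strlen(system);
    /* the token must end the list or be followed by whitespace / ',' */
    return !*p || isspace((unsigned char)*p) || *p == ',';
}

int main(void)
{
    printf("%d\n", system_listed("sched", "sched,irq"));     /* 1 */
    printf("%d\n", system_listed("sched", "sched_ext,irq")); /* 0 */
    return 0;
}

As written, only the first strstr() hit is inspected, so a system whose name first appears as a suffix of another entry (for example "irq" against "hardirq,irq") is rejected; callers are expected to pass well-formed lists.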
static struct trace_event_file *
|
||||
trace_create_new_event(struct trace_event_call *call,
|
||||
struct trace_array *tr)
|
||||
@@ -3050,9 +3071,12 @@ trace_create_new_event(struct trace_event_call *call,
|
||||
struct trace_event_file *file;
|
||||
unsigned int first;
|
||||
|
||||
if (!event_in_systems(call, trace_array_get_system_names(tr)))
|
||||
return NULL;
|
||||
|
||||
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
|
||||
if (!file)
|
||||
return NULL;
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
pid_list = rcu_dereference_protected(tr->filtered_pids,
|
||||
lockdep_is_held(&event_mutex));
|
||||
@@ -3117,8 +3141,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
|
||||
struct trace_event_file *file;
|
||||
|
||||
file = trace_create_new_event(call, tr);
|
||||
/*
|
||||
* trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
|
||||
* allocation, or NULL if the event is not part of the tr->system_names.
|
||||
* When the event is not part of the tr->system_names, return zero, not
|
||||
* an error.
|
||||
*/
|
||||
if (!file)
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
|
||||
if (eventdir_initialized)
|
||||
return event_create_dir(tr->event_dir, file);
|
||||
@@ -3157,8 +3190,17 @@ __trace_early_add_new_event(struct trace_event_call *call,
|
||||
int ret;
|
||||
|
||||
file = trace_create_new_event(call, tr);
|
||||
/*
|
||||
* trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
|
||||
* allocation, or NULL if the event is not part of the tr->system_names.
|
||||
* When the event is not part of the tr->system_names, return zero, not
|
||||
* an error.
|
||||
*/
|
||||
if (!file)
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
|
||||
ret = event_define_fields(call);
|
||||
if (ret)
|
||||
|
mm/Kconfig | 34
@@ -994,6 +994,40 @@ config CMA_AREAS
|
||||
|
||||
If unsure, leave the default value "7" in UMA and "19" in NUMA.
|
||||
|
||||
#
|
||||
# Select this config option from the architecture Kconfig, if available, to set
|
||||
# the max page order for physically contiguous allocations.
|
||||
#
|
||||
config ARCH_FORCE_MAX_ORDER
|
||||
int
|
||||
|
||||
#
|
||||
# When ARCH_FORCE_MAX_ORDER is not defined,
|
||||
# the default page block order is MAX_PAGE_ORDER (10) as per
|
||||
# include/linux/mmzone.h.
|
||||
#
|
||||
config PAGE_BLOCK_ORDER
|
||||
int "Page Block Order"
|
||||
range 1 10 if ARCH_FORCE_MAX_ORDER = 0 || ARCH_FORCE_MAX_ORDER = ""
|
||||
default 10 if ARCH_FORCE_MAX_ORDER = 0 || ARCH_FORCE_MAX_ORDER = ""
|
||||
range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
|
||||
default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
|
||||
help
|
||||
The page block order refers to the power of two number of pages that
|
||||
are physically contiguous and can have a migrate type associated to
|
||||
them. The maximum size of the page block order is limited by
|
||||
ARCH_FORCE_MAX_ORDER.
|
||||
|
||||
This config allows overriding the default page block order when the
|
||||
page block order is required to be smaller than ARCH_FORCE_MAX_ORDER
|
||||
or MAX_ORDER.
|
||||
|
||||
Reducing pageblock order can negatively impact THP generation
|
||||
success rate. If your workloads uses THP heavily, please use this
|
||||
option with caution.
|
||||
|
||||
Don't change if unsure.
|
||||
|
||||
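For a rough sense of scale: a pageblock covers PAGE_SIZE << pageblock_order bytes, and with this option pageblock_order follows PAGE_BLOCK_ORDER (subject to the ARCH_FORCE_MAX_ORDER cap described above). The arithmetic below is purely illustrative and not taken from the patch:

#include <stdio.h>

int main(void)
{
    const struct { long page_kib; int order; } cfg[] = {
        {  4, 10 },  /* default order 10 on 4 KiB pages   ->    4 MiB blocks */
        {  4,  7 },  /* PAGE_BLOCK_ORDER=7 on 4 KiB pages  ->  512 KiB blocks */
        { 16,  7 },  /* PAGE_BLOCK_ORDER=7 on 16 KiB pages ->    2 MiB blocks */
    };

    for (unsigned long i = 0; i < sizeof(cfg) / sizeof(cfg[0]); i++)
        printf("PAGE_SIZE=%2ldK order=%2d -> pageblock=%5ld KiB\n",
               cfg[i].page_kib, cfg[i].order,
               cfg[i].page_kib << cfg[i].order);
    return 0;
}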
config MEM_SOFT_DIRTY
|
||||
bool "Track memory changes"
|
||||
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
|
||||
|
mm/huge_memory.c | 150
@@ -70,6 +70,7 @@ unsigned long transparent_hugepage_flags __read_mostly =
|
||||
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
|
||||
|
||||
static struct shrinker deferred_split_shrinker;
|
||||
static bool split_underused_thp = true;
|
||||
|
||||
static atomic_t huge_zero_refcount;
|
||||
struct page *huge_zero_page __read_mostly;
|
||||
@@ -423,6 +424,27 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj,
|
||||
static struct kobj_attribute hpage_pmd_size_attr =
|
||||
__ATTR_RO(hpage_pmd_size);
|
||||
|
||||
static ssize_t split_underused_thp_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
return sysfs_emit(buf, "%d\n", split_underused_thp);
|
||||
}
|
||||
|
||||
static ssize_t split_underused_thp_store(struct kobject *kobj,
|
||||
struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
int err = kstrtobool(buf, &split_underused_thp);
|
||||
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static struct kobj_attribute split_underused_thp_attr = __ATTR(
|
||||
shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);
|
||||
|
||||
static struct attribute *hugepage_attr[] = {
|
||||
&enabled_attr.attr,
|
||||
&defrag_attr.attr,
|
||||
@@ -431,6 +453,7 @@ static struct attribute *hugepage_attr[] = {
|
||||
#ifdef CONFIG_SHMEM
|
||||
&shmem_enabled_attr.attr,
|
||||
#endif
|
||||
&split_underused_thp_attr.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
@@ -1046,6 +1069,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
|
||||
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
|
||||
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
|
||||
mm_inc_nr_ptes(vma->vm_mm);
|
||||
deferred_split_folio(folio, false);
|
||||
spin_unlock(vmf->ptl);
|
||||
count_vm_event(THP_FAULT_ALLOC);
|
||||
count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
|
||||
@@ -2953,7 +2977,7 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
|
||||
return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
|
||||
}
|
||||
|
||||
static void remap_page(struct folio *folio, unsigned long nr)
|
||||
static void remap_page(struct folio *folio, unsigned long nr, int flags)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
@@ -2961,7 +2985,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
|
||||
if (!folio_test_anon(folio))
|
||||
return;
|
||||
for (;;) {
|
||||
remove_migration_ptes(folio, folio, true);
|
||||
remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
|
||||
i += folio_nr_pages(folio);
|
||||
if (i >= nr)
|
||||
break;
|
||||
@@ -3314,7 +3338,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
|
||||
|
||||
if (nr_dropped)
|
||||
shmem_uncharge(head->mapping->host, nr_dropped);
|
||||
remap_page(folio, nr);
|
||||
remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
|
||||
|
||||
for (i = 0; i < nr; i++) {
|
||||
struct page *subpage = folio_dst_page(folio, i);
|
||||
@@ -3376,8 +3400,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
struct folio *folio = page_folio(page);
|
||||
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
|
||||
XA_STATE(xas, &folio->mapping->i_pages, folio->index);
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
bool is_anon = folio_test_anon(folio);
|
||||
struct address_space *mapping = NULL;
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
int extra_pins, ret;
|
||||
pgoff_t end;
|
||||
bool is_hzp;
|
||||
@@ -3394,7 +3419,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
if (folio_test_writeback(folio))
|
||||
return -EBUSY;
|
||||
|
||||
if (folio_test_anon(folio)) {
|
||||
if (is_anon) {
|
||||
/*
|
||||
* The caller does not necessarily hold an mmap_lock that would
|
||||
* prevent the anon_vma disappearing so we first we take a
|
||||
@@ -3495,6 +3520,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
if (folio_order(folio) > 1 &&
|
||||
!list_empty(&folio->_deferred_list)) {
|
||||
ds_queue->split_queue_len--;
|
||||
if (folio_test_partially_mapped(folio))
|
||||
__folio_clear_partially_mapped(folio);
|
||||
/*
|
||||
* Reinitialize page_deferred_list after removing the
|
||||
* page from the split_queue, otherwise a subsequent
|
||||
* split will see list corruption when checking the
|
||||
* page_deferred_list.
|
||||
*/
|
||||
list_del_init(&folio->_deferred_list);
|
||||
}
|
||||
spin_unlock(&ds_queue->split_queue_lock);
|
||||
@@ -3522,7 +3555,7 @@ unfreeze:
|
||||
folio_ref_unfreeze(folio, 1 + extra_pins);
|
||||
remap:
|
||||
free_dst_pages(folio);
|
||||
remap_page(folio, folio_nr_pages(folio));
|
||||
remap_page(folio, folio_nr_pages(folio), 0);
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
@@ -3572,6 +3605,8 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
|
||||
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
|
||||
if (!list_empty(&folio->_deferred_list)) {
|
||||
ds_queue->split_queue_len--;
|
||||
if (folio_test_partially_mapped(folio))
|
||||
__folio_clear_partially_mapped(folio);
|
||||
list_del_init(&folio->_deferred_list);
|
||||
unqueued = true;
|
||||
}
|
||||
@@ -3580,7 +3615,8 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
|
||||
return unqueued; /* useful for debug warnings */
|
||||
}
|
||||
|
||||
void deferred_split_folio(struct folio *folio)
|
||||
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
|
||||
void deferred_split_folio(struct folio *folio, bool partially_mapped)
|
||||
{
|
||||
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
|
||||
#ifdef CONFIG_MEMCG
|
||||
@@ -3595,6 +3631,9 @@ void deferred_split_folio(struct folio *folio)
|
||||
if (folio_order(folio) <= 1)
|
||||
return;
|
||||
|
||||
if (!partially_mapped && !split_underused_thp)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Exclude swapcache: originally to avoid a corrupt deferred split
|
||||
* queue. Nowadays that is fully prevented by mem_cgroup_swapout();
|
||||
@@ -3605,13 +3644,20 @@ void deferred_split_folio(struct folio *folio)
|
||||
if (folio_test_swapcache(folio))
|
||||
return;
|
||||
|
||||
if (!list_empty(&folio->_deferred_list))
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
|
||||
if (partially_mapped) {
|
||||
if (!folio_test_partially_mapped(folio)) {
|
||||
__folio_set_partially_mapped(folio);
|
||||
if (folio_test_pmd_mappable(folio))
|
||||
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
|
||||
count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
|
||||
|
||||
}
|
||||
} else {
|
||||
/* partially mapped folios cannot become non-partially mapped */
|
||||
VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
|
||||
}
|
||||
if (list_empty(&folio->_deferred_list)) {
|
||||
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
|
||||
count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
|
||||
list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
|
||||
ds_queue->split_queue_len++;
|
||||
#ifdef CONFIG_MEMCG
|
||||
@@ -3640,6 +3686,39 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
|
||||
return READ_ONCE(ds_queue->split_queue_len);
|
||||
}
|
||||
|
||||
static bool thp_underused(struct folio *folio)
{
int num_zero_pages = 0, num_filled_pages = 0;
void *kaddr;
int i;

if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
return false;

for (i = 0; i < folio_nr_pages(folio); i++) {
kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
num_zero_pages++;
if (num_zero_pages > khugepaged_max_ptes_none) {
kunmap_local(kaddr);
return true;
}
} else {
/*
* Another path for early exit once the number
* of non-zero filled pages exceeds threshold.
*/
num_filled_pages++;
if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
kunmap_local(kaddr);
return false;
}
}
kunmap_local(kaddr);
}
return false;
}
|
||||
|
||||
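thp_underused() above treats a PMD-sized folio as underused once more than khugepaged_max_ptes_none of its subpages are entirely zero, bailing out early in either direction. The standalone model below mirrors that threshold logic in plain C so it can be exercised outside the kernel; the page size, folio size and threshold are illustrative, and memcmp() against a zero page stands in for memchr_inv().

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096
#define HPAGE_NR  512              /* 4 KiB subpages per 2 MiB THP */

static bool page_is_zero(const unsigned char *page)
{
    static const unsigned char zero[PAGE_SIZE];
    return memcmp(page, zero, PAGE_SIZE) == 0;
}

static bool folio_underused(unsigned char (*folio)[PAGE_SIZE], int nr,
                            int max_ptes_none)
{
    int zero = 0, filled = 0;

    for (int i = 0; i < nr; i++) {
        if (page_is_zero(folio[i])) {
            if (++zero > max_ptes_none)
                return true;        /* early exit: underused */
        } else if (++filled >= nr - max_ptes_none) {
            return false;           /* early exit: used enough */
        }
    }
    return false;
}

int main(void)
{
    static unsigned char folio[HPAGE_NR][PAGE_SIZE];    /* all zero */

    folio[0][0] = 1;                                    /* one page actually used */
    printf("underused: %d\n", folio_underused(folio, HPAGE_NR, 64));
    return 0;
}

The kernel version additionally returns false up front when khugepaged_max_ptes_none == HPAGE_PMD_NR - 1, as the hunk above shows.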
static unsigned long deferred_split_scan(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
@@ -3647,8 +3726,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
|
||||
struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
|
||||
unsigned long flags;
|
||||
LIST_HEAD(list);
|
||||
struct folio *folio, *next;
|
||||
int split = 0;
|
||||
struct folio *folio, *next, *prev = NULL;
|
||||
int split = 0, removed = 0;
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
if (sc->memcg)
|
||||
@@ -3663,6 +3742,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
|
||||
list_move(&folio->_deferred_list, &list);
|
||||
} else {
|
||||
/* We lost race with folio_put() */
|
||||
if (folio_test_partially_mapped(folio))
|
||||
__folio_clear_partially_mapped(folio);
|
||||
list_del_init(&folio->_deferred_list);
|
||||
ds_queue->split_queue_len--;
|
||||
}
|
||||
@@ -3672,20 +3753,55 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
|
||||
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
|
||||
|
||||
list_for_each_entry_safe(folio, next, &list, _deferred_list) {
|
||||
bool did_split = false;
|
||||
bool underused = false;
|
||||
|
||||
if (!folio_test_partially_mapped(folio)) {
|
||||
underused = thp_underused(folio);
|
||||
if (!underused)
|
||||
goto next;
|
||||
}
|
||||
if (!folio_trylock(folio))
|
||||
goto next;
|
||||
/* split_huge_page() removes page from list on success */
|
||||
if (!split_folio(folio))
|
||||
if (!split_folio(folio)) {
|
||||
did_split = true;
|
||||
split++;
|
||||
}
|
||||
folio_unlock(folio);
|
||||
next:
|
||||
folio_put(folio);
|
||||
/*
|
||||
* split_folio() removes folio from list on success.
|
||||
* Only add back to the queue if folio is partially mapped.
|
||||
* If thp_underused returns false, or if split_folio fails
|
||||
* in the case it was underused, then consider it used and
|
||||
* don't add it back to split_queue.
|
||||
*/
|
||||
if (did_split) {
|
||||
; /* folio already removed from list */
|
||||
} else if (!folio_test_partially_mapped(folio)) {
|
||||
list_del_init(&folio->_deferred_list);
|
||||
removed++;
|
||||
} else {
|
||||
/*
|
||||
* That unlocked list_del_init() above would be unsafe,
|
||||
* unless its folio is separated from any earlier folios
|
||||
* left on the list (which may be concurrently unqueued)
|
||||
* by one safe folio with refcount still raised.
|
||||
*/
|
||||
swap(folio, prev);
|
||||
}
|
||||
if (folio)
|
||||
folio_put(folio);
|
||||
}
|
||||
|
||||
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
|
||||
list_splice_tail(&list, &ds_queue->split_queue);
|
||||
ds_queue->split_queue_len -= removed;
|
||||
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
|
||||
|
||||
if (prev)
|
||||
folio_put(prev);
|
||||
|
||||
/*
|
||||
* Stop shrinker if we didn't split any page, but the queue is empty.
|
||||
* This can happen if pages were freed under us.
|
||||
|
@@ -470,7 +470,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
|
||||
#define K(x) ((x) << (PAGE_SHIFT-10))
|
||||
|
||||
extern char * const zone_names[MAX_NR_ZONES];
|
||||
extern unsigned long free_highatomics[MAX_NR_ZONES];
|
||||
extern unsigned long nr_free_highatomic[MAX_NR_ZONES];
|
||||
|
||||
/* perform sanity checks on struct pages being allocated or freed */
|
||||
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
|
||||
@@ -721,8 +721,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
|
||||
gfp_t gfp_flags);
|
||||
extern int user_min_free_kbytes;
|
||||
|
||||
extern void free_unref_page(struct page *page, unsigned int order);
|
||||
extern void free_unref_page_list(struct list_head *list);
|
||||
void free_unref_page(struct page *page, unsigned int order);
|
||||
void free_unref_folios(struct folio_batch *fbatch);
|
||||
void free_unref_page_list(struct list_head *list);
|
||||
|
||||
extern void zone_pcp_reset(struct zone *zone);
|
||||
extern void zone_pcp_disable(struct zone *zone);
|
||||
|
@@ -84,7 +84,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
|
||||
*
|
||||
* Note that these are only respected if collapse was initiated by khugepaged.
|
||||
*/
|
||||
static unsigned int khugepaged_max_ptes_none __read_mostly;
|
||||
unsigned int khugepaged_max_ptes_none __read_mostly;
|
||||
static unsigned int khugepaged_max_ptes_swap __read_mostly;
|
||||
static unsigned int khugepaged_max_ptes_shared __read_mostly;
|
||||
|
||||
@@ -1218,6 +1218,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
|
||||
pgtable_trans_huge_deposit(mm, pmd, pgtable);
|
||||
set_pmd_at(mm, address, pmd, _pmd);
|
||||
update_mmu_cache_pmd(vma, address, pmd);
|
||||
deferred_split_folio(folio, false);
|
||||
spin_unlock(pmd_ptl);
|
||||
|
||||
hpage = NULL;
|
||||
|
@@ -33,6 +33,7 @@
|
||||
#include <linux/shmem_fs.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/vm_event_item.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/page-flags.h>
|
||||
@@ -95,6 +96,9 @@ static bool cgroup_memory_nokmem __ro_after_init;
|
||||
/* BPF memory accounting disabled? */
|
||||
static bool cgroup_memory_nobpf __ro_after_init;
|
||||
|
||||
static struct kmem_cache *memcg_cachep;
|
||||
static struct kmem_cache *memcg_pn_cachep;
|
||||
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
|
||||
#endif
|
||||
@@ -5384,7 +5388,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
|
||||
{
|
||||
struct mem_cgroup_per_node *pn;
|
||||
|
||||
pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
|
||||
pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO,
|
||||
node);
|
||||
if (!pn)
|
||||
return 1;
|
||||
|
||||
@@ -5440,7 +5445,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
|
||||
int __maybe_unused i;
|
||||
long error = -ENOMEM;
|
||||
|
||||
memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
|
||||
memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL);
|
||||
if (!memcg)
|
||||
return ERR_PTR(error);
|
||||
|
||||
@@ -6017,8 +6022,6 @@ int mem_cgroup_move_account(struct folio *folio,
|
||||
css_get(&to->css);
|
||||
css_put(&from->css);
|
||||
|
||||
/* Warning should never happen, so don't worry about refcount non-0 */
|
||||
WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
|
||||
folio->memcg_data = (unsigned long)to;
|
||||
|
||||
__folio_memcg_unlock(from);
|
||||
@@ -6389,9 +6392,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
|
||||
enum mc_target_type target_type;
|
||||
union mc_target target;
|
||||
struct folio *folio;
|
||||
bool tried_split_before = false;
|
||||
|
||||
retry_pmd:
|
||||
ptl = pmd_trans_huge_lock(pmd, vma);
|
||||
if (ptl) {
|
||||
if (mc.precharge < HPAGE_PMD_NR) {
|
||||
@@ -6401,27 +6402,6 @@ retry_pmd:
|
||||
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
|
||||
if (target_type == MC_TARGET_PAGE) {
|
||||
folio = target.folio;
|
||||
/*
|
||||
* Deferred split queue locking depends on memcg,
|
||||
* and unqueue is unsafe unless folio refcount is 0:
|
||||
* split or skip if on the queue? first try to split.
|
||||
*/
|
||||
if (!list_empty(&folio->_deferred_list)) {
|
||||
spin_unlock(ptl);
|
||||
if (!tried_split_before)
|
||||
split_folio(folio);
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
if (tried_split_before)
|
||||
return 0;
|
||||
tried_split_before = true;
|
||||
goto retry_pmd;
|
||||
}
|
||||
/*
|
||||
* So long as that pmd lock is held, the folio cannot
|
||||
* be racily added to the _deferred_list, because
|
||||
* __folio_remove_rmap() will find !partially_mapped.
|
||||
*/
|
||||
if (folio_isolate_lru(folio)) {
|
||||
if (!mem_cgroup_move_account(folio, true,
|
||||
mc.from, mc.to)) {
|
||||
@@ -7418,6 +7398,18 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
|
||||
uncharge_batch(&ug);
|
||||
}
|
||||
|
||||
void __mem_cgroup_uncharge_folios(struct folio_batch *folios)
|
||||
{
|
||||
struct uncharge_gather ug;
|
||||
unsigned int i;
|
||||
|
||||
uncharge_gather_clear(&ug);
|
||||
for (i = 0; i < folios->nr; i++)
|
||||
uncharge_folio(folios->folios[i], &ug);
|
||||
if (ug.memcg)
|
||||
uncharge_batch(&ug);
|
||||
}
|
||||
|
||||
/**
|
||||
* mem_cgroup_replace_folio - Charge a folio's replacement.
|
||||
* @old: Currently circulating folio.
|
||||
@@ -7606,15 +7598,16 @@ static int __init cgroup_memory(char *s)
|
||||
__setup("cgroup.memory=", cgroup_memory);
|
||||
|
||||
/*
|
||||
* subsys_initcall() for memory controller.
|
||||
* Memory controller init before cgroup_init() initialize root_mem_cgroup.
|
||||
*
|
||||
* Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
|
||||
* context because of lock dependencies (cgroup_lock -> cpu hotplug) but
|
||||
* basically everything that doesn't depend on a specific mem_cgroup structure
|
||||
* should be initialized from here.
|
||||
*/
|
||||
static int __init mem_cgroup_init(void)
|
||||
int __init mem_cgroup_init(void)
|
||||
{
|
||||
unsigned int memcg_size;
|
||||
int cpu, node;
|
||||
|
||||
/*
|
||||
@@ -7632,6 +7625,13 @@ static int __init mem_cgroup_init(void)
|
||||
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
|
||||
drain_local_stock);
|
||||
|
||||
memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
|
||||
memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
|
||||
SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
|
||||
|
||||
memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node,
|
||||
SLAB_PANIC | SLAB_HWCACHE_ALIGN);
|
||||
|
||||
for_each_node(node) {
|
||||
struct mem_cgroup_tree_per_node *rtpn;
|
||||
|
||||
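The hunks above move struct mem_cgroup and struct mem_cgroup_per_node from kzalloc()/kzalloc_node() onto dedicated slab caches; because mem_cgroup ends in a nodeinfo[] flexible array sized by nr_node_ids, the cache size is computed once at init with struct_size_t(). A generic sketch of the same pattern follows, using a made-up struct foo purely for illustration:

#include <linux/slab.h>
#include <linux/overflow.h>
#include <linux/spinlock.h>
#include <linux/errno.h>

struct foo {
    spinlock_t lock;
    void *node_info[];           /* length only known at boot, e.g. nr_node_ids */
};

static struct kmem_cache *foo_cachep;

static int foo_cache_init(unsigned int nr_nodes)
{
    size_t size = struct_size_t(struct foo, node_info, nr_nodes);

    foo_cachep = kmem_cache_create("foo", size, 0,
                                   SLAB_HWCACHE_ALIGN, NULL);
    return foo_cachep ? 0 : -ENOMEM;
}

static struct foo *foo_alloc(void)
{
    /* zeroed, like kmem_cache_zalloc(memcg_cachep, GFP_KERNEL) above */
    return kmem_cache_zalloc(foo_cachep, GFP_KERNEL);
}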
@@ -7645,7 +7645,6 @@ static int __init mem_cgroup_init(void)
|
||||
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(mem_cgroup_init);
|
||||
|
||||
#ifdef CONFIG_SWAP
|
||||
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
|
||||
|
@@ -19,6 +19,8 @@
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/writeback.h>
|
||||
#include "slab.h"
|
||||
#undef CREATE_TRACE_POINTS
|
||||
#include <trace/hooks/mm.h>
|
||||
|
||||
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
|
||||
static void poison_error(mempool_t *pool, void *element, size_t size,
|
||||
@@ -383,6 +385,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
|
||||
unsigned long flags;
|
||||
wait_queue_entry_t wait;
|
||||
gfp_t gfp_temp;
|
||||
bool skip_wait = false;
|
||||
|
||||
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
|
||||
might_alloc(gfp_mask);
|
||||
@@ -428,6 +431,11 @@ repeat_alloc:
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
return NULL;
|
||||
}
|
||||
trace_android_vh_mempool_alloc_skip_wait(&gfp_temp, &skip_wait);
|
||||
if (skip_wait) {
|
||||
spin_unlock_irqrestore(&pool->lock, flags);
|
||||
goto repeat_alloc;
|
||||
}
|
||||
|
||||
/* Let's wait for someone else to return an element to @pool */
|
||||
init_wait(&wait);
|
||||
|
mm/migrate.c | 106
@@ -182,13 +182,57 @@ void putback_movable_pages(struct list_head *l)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(putback_movable_pages);
|
||||
|
||||
static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
|
||||
struct folio *folio,
|
||||
unsigned long idx)
|
||||
{
|
||||
struct page *page = folio_page(folio, idx);
|
||||
bool contains_data;
|
||||
pte_t newpte;
|
||||
void *addr;
|
||||
|
||||
VM_BUG_ON_PAGE(PageCompound(page), page);
|
||||
VM_BUG_ON_PAGE(!PageAnon(page), page);
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
|
||||
|
||||
if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
|
||||
mm_forbids_zeropage(pvmw->vma->vm_mm))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* The pmd entry mapping the old thp was flushed and the pte mapping
|
||||
* this subpage has been non present. If the subpage is only zero-filled
|
||||
* then map it to the shared zeropage.
|
||||
*/
|
||||
addr = kmap_local_page(page);
|
||||
contains_data = memchr_inv(addr, 0, PAGE_SIZE);
|
||||
kunmap_local(addr);
|
||||
|
||||
if (contains_data)
|
||||
return false;
|
||||
|
||||
newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
|
||||
pvmw->vma->vm_page_prot));
|
||||
set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
|
||||
|
||||
dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
|
||||
return true;
|
||||
}
|
||||
|
||||
struct rmap_walk_arg {
|
||||
struct folio *folio;
|
||||
bool map_unused_to_zeropage;
|
||||
};
|
||||
|
||||
/*
|
||||
* Restore a potential migration pte to a working pte entry
|
||||
*/
|
||||
static bool remove_migration_pte(struct folio *dst,
|
||||
struct vm_area_struct *vma, unsigned long addr, void *arg)
|
||||
{
|
||||
struct folio *src = arg;
|
||||
struct rmap_walk_arg *rmap_walk_arg = arg;
|
||||
struct folio *src = rmap_walk_arg->folio;
|
||||
DEFINE_FOLIO_VMA_WALK(pvmw, src, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
|
||||
|
||||
while (page_vma_mapped_walk(&pvmw)) {
|
||||
@@ -228,6 +272,9 @@ static bool remove_migration_pte(struct folio *dst,
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
if (rmap_walk_arg->map_unused_to_zeropage &&
|
||||
try_to_map_unused_to_zeropage(&pvmw, folio, idx))
|
||||
continue;
|
||||
|
||||
folio_get(folio);
|
||||
pte = mk_pte(page, READ_ONCE(vma->vm_page_prot));
|
||||
@@ -303,14 +350,21 @@ static bool remove_migration_pte(struct folio *dst,
|
||||
* Get rid of all migration entries and replace them by
|
||||
* references to the indicated page.
|
||||
*/
|
||||
void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
|
||||
void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
|
||||
{
|
||||
struct rmap_walk_control rwc = {
|
||||
.rmap_one = remove_migration_pte,
|
||||
.arg = src,
|
||||
struct rmap_walk_arg rmap_walk_arg = {
|
||||
.folio = src,
|
||||
.map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
|
||||
};
|
||||
|
||||
if (locked)
|
||||
struct rmap_walk_control rwc = {
|
||||
.rmap_one = remove_migration_pte,
|
||||
.arg = &rmap_walk_arg,
|
||||
};
|
||||
|
||||
VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
|
||||
|
||||
if (flags & RMP_LOCKED)
|
||||
rmap_walk_locked(dst, &rwc);
|
||||
else
|
||||
rmap_walk(dst, &rwc);
|
||||
@@ -461,7 +515,8 @@ int folio_migrate_mapping(struct address_space *mapping,
|
||||
}
|
||||
|
||||
/* Take off deferred split queue while frozen and memcg set */
|
||||
folio_unqueue_deferred_split(folio);
|
||||
if (folio_test_large(folio) && folio_test_large_rmappable(folio))
|
||||
folio_unqueue_deferred_split(folio);
|
||||
|
||||
/*
|
||||
* Now we know that no one else is looking at the folio:
|
||||
@@ -933,7 +988,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
|
||||
* At this point we know that the migration attempt cannot
|
||||
* be successful.
|
||||
*/
|
||||
remove_migration_ptes(folio, folio, false);
|
||||
remove_migration_ptes(folio, folio, 0);
|
||||
|
||||
rc = mapping->a_ops->writepage(&folio->page, &wbc);
|
||||
|
||||
@@ -1096,7 +1151,7 @@ static void migrate_folio_undo_src(struct folio *src,
|
||||
struct list_head *ret)
|
||||
{
|
||||
if (page_was_mapped)
|
||||
remove_migration_ptes(src, src, false);
|
||||
remove_migration_ptes(src, src, 0);
|
||||
/* Drop an anon_vma reference if we took one */
|
||||
if (anon_vma)
|
||||
put_anon_vma(anon_vma);
|
||||
@@ -1335,7 +1390,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
|
||||
lru_add_drain();
|
||||
|
||||
if (old_page_state & PAGE_WAS_MAPPED)
|
||||
remove_migration_ptes(src, dst, false);
|
||||
remove_migration_ptes(src, dst, 0);
|
||||
|
||||
out_unlock_both:
|
||||
folio_unlock(dst);
|
||||
@@ -1474,7 +1529,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
|
||||
|
||||
if (page_was_mapped)
|
||||
remove_migration_ptes(src,
|
||||
rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
|
||||
rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
|
||||
|
||||
unlock_put_anon:
|
||||
folio_unlock(dst);
|
||||
@@ -1702,6 +1757,35 @@ static int migrate_pages_batch(struct list_head *from,
|
||||
|
||||
cond_resched();
|
||||
|
||||
/*
|
||||
* The rare folio on the deferred split list should
|
||||
* be split now. It should not count as a failure:
|
||||
* but increment nr_failed because, without doing so,
|
||||
* migrate_pages() may report success with (split but
|
||||
* unmigrated) pages still on its fromlist; whereas it
|
||||
* always reports success when its fromlist is empty.
|
||||
*
|
||||
* Only check it without removing it from the list.
|
||||
* Since the folio can be on deferred_split_scan()
|
||||
* local list and removing it can cause the local list
|
||||
* corruption. Folio split process below can handle it
|
||||
* with the help of folio_ref_freeze().
|
||||
*
|
||||
* nr_pages > 2 is needed to avoid checking order-1
|
||||
* page cache folios. They exist, in contrast to
|
||||
* non-existent order-1 anonymous folios, and do not
|
||||
* use _deferred_list.
|
||||
*/
|
||||
if (nr_pages > 2 &&
|
||||
!list_empty(&folio->_deferred_list) &&
|
||||
folio_test_partially_mapped(folio)) {
|
||||
if (!try_split_folio(folio, split_folios, mode)) {
|
||||
nr_failed++;
|
||||
stats->nr_thp_split += is_thp;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Large folio migration might be unsupported or
|
||||
* the allocation might be failed so we should retry
|
||||
|
@@ -422,7 +422,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
|
||||
continue;
|
||||
|
||||
folio = page_folio(page);
|
||||
remove_migration_ptes(folio, folio, false);
|
||||
remove_migration_ptes(folio, folio, 0);
|
||||
|
||||
src_pfns[i] = 0;
|
||||
folio_unlock(folio);
|
||||
@@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
|
||||
|
||||
src = page_folio(page);
|
||||
dst = page_folio(newpage);
|
||||
remove_migration_ptes(src, dst, false);
|
||||
remove_migration_ptes(src, dst, 0);
|
||||
folio_unlock(src);
|
||||
|
||||
if (is_zone_device_page(page))
|
||||
|
@@ -208,8 +208,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch)
|
||||
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irq(lruvec);
|
||||
folios_put(fbatch->folios, folio_batch_count(fbatch));
|
||||
folio_batch_reinit(fbatch);
|
||||
folios_put(fbatch);
|
||||
}
|
||||
|
||||
void mlock_drain_local(void)
|
||||
|
@@ -1558,7 +1558,7 @@ static inline void setup_usemap(struct zone *zone) {}
|
||||
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
|
||||
void __init set_pageblock_order(void)
|
||||
{
|
||||
unsigned int order = MAX_ORDER;
|
||||
unsigned int order = PAGE_BLOCK_ORDER;
|
||||
|
||||
/* Check that pageblock_nr_pages has not already been setup */
|
||||
if (pageblock_order)
|
||||
|
mm/page_alloc.c | 151
@@ -33,6 +33,7 @@
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/nodemask.h>
|
||||
#include <linux/vmstat.h>
|
||||
@@ -323,7 +324,7 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
|
||||
#endif
|
||||
};
|
||||
|
||||
unsigned long free_highatomics[MAX_NR_ZONES] = {0};
|
||||
unsigned long nr_free_highatomic[MAX_NR_ZONES] = {0};
|
||||
|
||||
int min_free_kbytes = 1024;
|
||||
int user_min_free_kbytes = -1;
|
||||
@@ -770,8 +771,8 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
|
||||
if (is_migrate_cma(migratetype))
|
||||
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
|
||||
else if (is_migrate_highatomic(migratetype))
|
||||
WRITE_ONCE(free_highatomics[zone_idx(zone)],
|
||||
free_highatomics[zone_idx(zone)] + nr_pages);
|
||||
WRITE_ONCE(nr_free_highatomic[zone_idx(zone)],
|
||||
nr_free_highatomic[zone_idx(zone)] + nr_pages);
|
||||
}
|
||||
|
||||
/* Used for pages not on another list */
|
||||
@@ -921,7 +922,6 @@ static inline void __free_one_page(struct page *page,
|
||||
VM_BUG_ON_PAGE(page->flags & check_flags, page);
|
||||
|
||||
VM_BUG_ON(migratetype == -1);
|
||||
|
||||
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
|
||||
VM_BUG_ON_PAGE(bad_range(zone, page), page);
|
||||
|
||||
@@ -1237,6 +1237,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
|
||||
}
|
||||
}
|
||||
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
|
||||
trace_android_vh_mm_free_page(page + i);
|
||||
}
|
||||
}
|
||||
if (PageMappingFlags(page))
|
||||
@@ -1252,6 +1253,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
|
||||
|
||||
page_cpupid_reset_last(page);
|
||||
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
|
||||
trace_android_vh_mm_free_page(page);
|
||||
reset_page_owner(page, order);
|
||||
free_page_pinner(page, order);
|
||||
page_table_check_free(page, order);
|
||||
@@ -1372,7 +1374,6 @@ static void free_one_page(struct zone *zone, struct page *page,
|
||||
static void __free_pages_ok(struct page *page, unsigned int order,
|
||||
fpi_t fpi_flags)
|
||||
{
|
||||
unsigned long flags;
|
||||
int migratetype;
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
struct zone *zone = page_zone(page);
|
||||
@@ -1392,21 +1393,17 @@ skip_prepare:
|
||||
fpi_flags, &skip_free_pages_ok);
|
||||
if (skip_free_pages_ok)
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
/*
|
||||
* Calling get_pfnblock_migratetype() without spin_lock_irqsave() here
|
||||
* is used to avoid calling get_pfnblock_migratetype() under the lock.
|
||||
* This will reduce the lock holding time.
|
||||
*/
|
||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
trace_android_vh_free_unref_page_bypass(page, order, migratetype, &skip_free_unref_page);
|
||||
if (skip_free_unref_page) {
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
if (skip_free_unref_page)
|
||||
return;
|
||||
}
|
||||
|
||||
if (unlikely(has_isolate_pageblock(zone) ||
|
||||
is_migrate_isolate(migratetype))) {
|
||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
}
|
||||
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
free_one_page(zone, page, pfn, order, fpi_flags);
|
||||
|
||||
__count_vm_events(PGFREE, 1 << order);
|
||||
}
|
||||
@@ -2249,8 +2246,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
|
||||
struct zone *zone;
|
||||
struct page *page;
|
||||
int order;
|
||||
int ret;
|
||||
bool skip_unreserve_highatomic = false;
|
||||
int ret;
|
||||
|
||||
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
|
||||
ac->nodemask) {
|
||||
@@ -2765,7 +2762,7 @@ void free_unref_page(struct page *page, unsigned int order)
|
||||
return;
|
||||
if (unlikely(migratetype > MIGRATE_RECLAIMABLE)) {
|
||||
if (unlikely(is_migrate_isolate(migratetype))) {
|
||||
free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
|
||||
free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
|
||||
return;
|
||||
}
|
||||
#ifdef CONFIG_CMA
|
||||
@@ -2781,64 +2778,65 @@ void free_unref_page(struct page *page, unsigned int order)
|
||||
free_unref_page_commit(zone, pcp, page, migratetype, order);
|
||||
pcp_spin_unlock(pcp);
|
||||
} else {
|
||||
free_one_page(zone, page, pfn, order, FPI_NONE);
|
||||
free_one_page(zone, page, pfn, order, FPI_NONE);
|
||||
}
|
||||
pcp_trylock_finish(UP_flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Free a list of 0-order pages
|
||||
* Free a batch of folios
|
||||
*/
|
||||
void free_unref_page_list(struct list_head *list)
|
||||
void free_unref_folios(struct folio_batch *folios)
|
||||
{
|
||||
unsigned long __maybe_unused UP_flags;
|
||||
struct page *page, *next;
|
||||
struct per_cpu_pages *pcp = NULL;
|
||||
struct zone *locked_zone = NULL;
|
||||
int batch_count = 0;
|
||||
int migratetype;
|
||||
bool skip_free = false;
|
||||
int i, j;
|
||||
|
||||
/* Prepare pages for freeing */
|
||||
list_for_each_entry_safe(page, next, list, lru) {
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
if (!free_pages_prepare(page, 0, FPI_NONE)) {
|
||||
list_del(&page->lru);
|
||||
/* Prepare folios for freeing */
|
||||
for (i = 0, j = 0; i < folios->nr; i++) {
|
||||
struct folio *folio = folios->folios[i];
|
||||
unsigned long pfn = folio_pfn(folio);
|
||||
unsigned int order = folio_order(folio);
|
||||
|
||||
if (order > 0 && folio_test_large_rmappable(folio))
|
||||
folio_unqueue_deferred_split(folio);
|
||||
if (!free_pages_prepare(&folio->page, order, FPI_NONE))
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Free isolated pages directly to the allocator, see
|
||||
* comment in free_unref_page.
|
||||
* Free orders not handled on the PCP directly to the
|
||||
* allocator.
|
||||
*/
|
||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
if (unlikely(is_migrate_isolate(migratetype))) {
|
||||
list_del(&page->lru);
|
||||
free_one_page(page_zone(page), page, pfn, 0, FPI_NONE);
|
||||
if (!pcp_allowed_order(order)) {
|
||||
free_one_page(folio_zone(folio), &folio->page,
|
||||
pfn, order, FPI_NONE);
|
||||
continue;
|
||||
}
|
||||
folio->private = (void *)(unsigned long)order;
|
||||
if (j != i)
|
||||
folios->folios[j] = folio;
|
||||
j++;
|
||||
}
|
||||
folios->nr = j;
|
||||
|
||||
trace_android_vh_free_unref_page_list_bypass(list, &skip_free);
|
||||
if (skip_free)
|
||||
return;
|
||||
for (i = 0; i < folios->nr; i++) {
|
||||
struct folio *folio = folios->folios[i];
|
||||
struct zone *zone = folio_zone(folio);
|
||||
unsigned long pfn = folio_pfn(folio);
|
||||
unsigned int order = (unsigned long)folio->private;
|
||||
int migratetype;
|
||||
|
||||
list_for_each_entry_safe(page, next, list, lru) {
|
||||
struct zone *zone = page_zone(page);
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
folio->private = NULL;
|
||||
migratetype = get_pfnblock_migratetype(&folio->page, pfn);
|
||||
|
||||
list_del(&page->lru);
|
||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
|
||||
/*
|
||||
* Either different zone requiring a different pcp lock or
|
||||
* excessive lock hold times when freeing a large list of
|
||||
* pages.
|
||||
*/
|
||||
if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) {
|
||||
/* Different zone requires a different pcp lock */
|
||||
if (zone != locked_zone ||
|
||||
is_migrate_isolate(migratetype)) {
|
||||
if (pcp) {
|
||||
pcp_spin_unlock(pcp);
|
||||
pcp_trylock_finish(UP_flags);
|
||||
locked_zone = NULL;
|
||||
pcp = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2846,24 +2844,21 @@ void free_unref_page_list(struct list_head *list)
|
||||
* allocator, see comment in free_unref_page.
|
||||
*/
|
||||
if (is_migrate_isolate(migratetype)) {
|
||||
free_one_page(zone, page, page_to_pfn(page),
|
||||
0, FPI_NONE);
|
||||
free_one_page(zone, &folio->page, pfn,
|
||||
order, FPI_NONE);
|
||||
continue;
|
||||
}
|
||||
|
||||
batch_count = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* trylock is necessary as pages may be getting freed
|
||||
* trylock is necessary as folios may be getting freed
|
||||
* from IRQ or SoftIRQ context after an IO completion.
|
||||
*/
|
||||
pcp_trylock_prepare(UP_flags);
|
||||
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
|
||||
if (unlikely(!pcp)) {
|
||||
pcp_trylock_finish(UP_flags);
|
||||
free_one_page(zone, page, pfn,
|
||||
0, FPI_NONE);
|
||||
locked_zone = NULL;
|
||||
free_one_page(zone, &folio->page, pfn,
|
||||
order, FPI_NONE);
|
||||
continue;
|
||||
}
|
||||
locked_zone = zone;
|
||||
@@ -2880,15 +2875,39 @@ void free_unref_page_list(struct list_head *list)
|
||||
migratetype = MIGRATE_MOVABLE;
|
||||
}
|
||||
|
||||
trace_mm_page_free_batched(page);
|
||||
free_unref_page_commit(zone, pcp, page, migratetype, 0);
|
||||
batch_count++;
|
||||
trace_mm_page_free_batched(&folio->page);
|
||||
free_unref_page_commit(zone, pcp, &folio->page, migratetype,
|
||||
order);
|
||||
}
|
||||
|
||||
if (pcp) {
|
||||
pcp_spin_unlock(pcp);
|
||||
pcp_trylock_finish(UP_flags);
|
||||
}
|
||||
folio_batch_reinit(folios);
|
||||
}
|
||||
|
||||
void free_unref_page_list(struct list_head *list)
|
||||
{
|
||||
struct folio_batch fbatch;
|
||||
bool skip_free = false;
|
||||
|
||||
trace_android_vh_free_unref_page_list_bypass(list, &skip_free);
|
||||
if (skip_free)
|
||||
return;
|
||||
|
||||
folio_batch_init(&fbatch);
|
||||
while (!list_empty(list)) {
|
||||
struct folio *folio = list_first_entry(list, struct folio, lru);
|
||||
|
||||
list_del(&folio->lru);
|
||||
if (folio_batch_add(&fbatch, folio) > 0)
|
||||
continue;
|
||||
free_unref_folios(&fbatch);
|
||||
}
|
||||
|
||||
if (fbatch.nr)
|
||||
free_unref_folios(&fbatch);
|
||||
}
|
||||
|
||||
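With this series the bulk-free path is driven by a struct folio_batch instead of an open-coded list: free_unref_page_list() above just drains its list into a batch, and free_unref_folios() records each folio's order in folio->private before committing to the per-CPU lists. A minimal sketch of the idiom from the mm-internal side is below (free_unref_folios() is declared in mm/internal.h in this series, so this only applies inside mm/; the helper name is made up):

#include <linux/mm.h>
#include <linux/pagevec.h>
#include "internal.h"            /* free_unref_folios() */

static void free_folio_array(struct folio **folios, unsigned int nr)
{
    struct folio_batch fbatch;
    unsigned int i;

    folio_batch_init(&fbatch);
    for (i = 0; i < nr; i++) {
        /* folio_batch_add() returns the space left in the batch */
        if (folio_batch_add(&fbatch, folios[i]) > 0)
            continue;
        free_unref_folios(&fbatch);   /* full: flush, batch is reset */
    }
    if (folio_batch_count(&fbatch))
        free_unref_folios(&fbatch);   /* flush the tail */
}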
/*
|
||||
@@ -3216,7 +3235,7 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
|
||||
* watermark then subtract the free pages reserved for highatomic.
|
||||
*/
|
||||
if (likely(!(alloc_flags & ALLOC_RESERVES)))
|
||||
unusable_free += READ_ONCE(free_highatomics[zone_idx(z)]);
|
||||
unusable_free += READ_ONCE(nr_free_highatomic[zone_idx(z)]);
|
||||
|
||||
#ifdef CONFIG_CMA
|
||||
/* If allocation can't use CMA areas don't use free CMA pages */
|
||||
|
@@ -417,9 +417,9 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
|
||||
|
||||
ret = __alloc_contig_migrate_range(&cc, head_pfn,
|
||||
head_pfn + nr_pages, page_mt);
|
||||
|
||||
if (ret)
|
||||
goto failed;
|
||||
|
||||
pfn = head_pfn + nr_pages;
|
||||
continue;
|
||||
}
|
||||
|
@@ -270,6 +270,9 @@ static const struct vm_operations_struct pad_vma_ops = {
|
||||
.name = pad_vma_name,
|
||||
};
|
||||
|
||||
/* Defined in kernel/fork.c */
|
||||
extern struct kmem_cache *vm_area_cachep;
|
||||
|
||||
/*
|
||||
* Returns a new VMA representing the padding in @vma;
|
||||
* returns NULL if no padding in @vma or allocation failed.
|
||||
@@ -281,7 +284,7 @@ static struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma)
|
||||
if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK))
|
||||
return NULL;
|
||||
|
||||
pad = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
|
||||
pad = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
|
||||
if (!pad) {
|
||||
pr_warn("Page size migration: Failed to allocate padding VMA");
|
||||
return NULL;
|
||||
@@ -347,7 +350,7 @@ void show_map_pad_vma(struct vm_area_struct *vma, struct seq_file *m,
|
||||
else
|
||||
((show_pad_maps_fn)func)(m, pad);
|
||||
|
||||
kfree(pad);
|
||||
kmem_cache_free(vm_area_cachep, pad);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -1599,8 +1599,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
|
||||
* Check partially_mapped first to ensure it is a large folio.
|
||||
*/
|
||||
if (folio_test_anon(folio) && partially_mapped &&
|
||||
list_empty(&folio->_deferred_list))
|
||||
deferred_split_folio(folio);
|
||||
!folio_test_partially_mapped(folio))
|
||||
deferred_split_folio(folio, true);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -342,7 +342,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
|
||||
K(low_wmark_pages(zone)),
|
||||
K(high_wmark_pages(zone)),
|
||||
K(zone->nr_reserved_highatomic),
|
||||
K(free_highatomics[zone_idx(zone)]),
|
||||
K(nr_free_highatomic[zone_idx(zone)]),
|
||||
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
|
||||
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
|
||||
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
|
||||
|
mm/swap.c | 171
@@ -77,26 +77,33 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
|
||||
.lock = INIT_LOCAL_LOCK(lock),
|
||||
};
|
||||
|
||||
static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
|
||||
unsigned long *flagsp)
|
||||
{
|
||||
if (folio_test_lru(folio)) {
|
||||
folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
|
||||
lruvec_del_folio(*lruvecp, folio);
|
||||
__folio_clear_lru_flags(folio);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This path almost never happens for VM activity - pages are normally freed
|
||||
* in batches. But it gets used by networking - and for compound pages.
|
||||
*/
|
||||
static void __page_cache_release(struct folio *folio)
|
||||
static void page_cache_release(struct folio *folio)
|
||||
{
|
||||
if (folio_test_lru(folio)) {
|
||||
struct lruvec *lruvec;
|
||||
unsigned long flags;
|
||||
struct lruvec *lruvec = NULL;
|
||||
unsigned long flags;
|
||||
|
||||
lruvec = folio_lruvec_lock_irqsave(folio, &flags);
|
||||
lruvec_del_folio(lruvec, folio);
|
||||
__folio_clear_lru_flags(folio);
|
||||
__page_cache_release(folio, &lruvec, &flags);
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
}
|
||||
}
|
||||
|
||||
static void __folio_put_small(struct folio *folio)
|
||||
{
|
||||
__page_cache_release(folio);
|
||||
page_cache_release(folio);
|
||||
mem_cgroup_uncharge(folio);
|
||||
free_unref_page(&folio->page, 0);
|
||||
}
|
||||
@@ -110,7 +117,7 @@ static void __folio_put_large(struct folio *folio)
|
||||
* be called for hugetlb (it has a separate hugetlb_cgroup.)
|
||||
*/
|
||||
if (!folio_test_hugetlb(folio))
|
||||
__page_cache_release(folio);
|
||||
page_cache_release(folio);
|
||||
destroy_large_folio(folio);
|
||||
}
|
||||
|
||||
@@ -133,22 +140,25 @@ EXPORT_SYMBOL(__folio_put);
|
||||
*/
|
||||
void put_pages_list(struct list_head *pages)
|
||||
{
|
||||
struct folio_batch fbatch;
|
||||
struct folio *folio, *next;
|
||||
|
||||
folio_batch_init(&fbatch);
|
||||
list_for_each_entry_safe(folio, next, pages, lru) {
|
||||
if (!folio_put_testzero(folio)) {
|
||||
list_del(&folio->lru);
|
||||
if (!folio_put_testzero(folio))
|
||||
continue;
|
||||
}
|
||||
if (folio_test_large(folio)) {
|
||||
list_del(&folio->lru);
|
||||
__folio_put_large(folio);
|
||||
continue;
|
||||
}
|
||||
/* LRU flag must be clear because it's passed using the lru */
|
||||
if (folio_batch_add(&fbatch, folio) > 0)
|
||||
continue;
|
||||
free_unref_folios(&fbatch);
|
||||
}
|
||||
|
||||
free_unref_page_list(pages);
|
||||
if (fbatch.nr)
|
||||
free_unref_folios(&fbatch);
|
||||
INIT_LIST_HEAD(pages);
|
||||
}
|
||||
EXPORT_SYMBOL(put_pages_list);
|
||||
@@ -170,7 +180,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
|
||||
* while the LRU lock is held.
|
||||
*
|
||||
* (That is not true of __page_cache_release(), and not necessarily
|
||||
* true of release_pages(): but those only clear the mlocked flag after
|
||||
* true of folios_put(): but those only clear the mlocked flag after
|
||||
* folio_put_testzero() has excluded any other users of the folio.)
|
||||
*/
|
||||
if (folio_evictable(folio)) {
|
||||
@@ -208,7 +218,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
|
||||
if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
|
||||
continue;
|
||||
|
||||
lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
|
||||
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
|
||||
move_fn(lruvec, folio);
|
||||
|
||||
folio_set_lru(folio);
|
||||
@@ -216,8 +226,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
|
||||
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
folios_put(fbatch->folios, folio_batch_count(fbatch));
|
||||
folio_batch_reinit(fbatch);
|
||||
folios_put(fbatch);
|
||||
}
|
||||
|
||||
static void folio_batch_add_and_move(struct folio_batch *fbatch,
|
||||
@@ -958,47 +967,29 @@ void lru_cache_disable(void)
|
||||
EXPORT_SYMBOL_GPL(lru_cache_disable);
|
||||
|
||||
/**
|
||||
* release_pages - batched put_page()
|
||||
* @arg: array of pages to release
|
||||
* @nr: number of pages
|
||||
* folios_put_refs - Reduce the reference count on a batch of folios.
|
||||
* @folios: The folios.
|
||||
* @refs: The number of refs to subtract from each folio.
|
||||
*
|
||||
* Decrement the reference count on all the pages in @arg. If it
|
||||
* fell to zero, remove the page from the LRU and free it.
|
||||
* Like folio_put(), but for a batch of folios. This is more efficient
|
||||
* than writing the loop yourself as it will optimise the locks which need
|
||||
* to be taken if the folios are freed. The folios batch is returned
|
||||
* empty and ready to be reused for another batch; there is no need
|
||||
* to reinitialise it. If @refs is NULL, we subtract one from each
|
||||
* folio refcount.
|
||||
*
|
||||
* Note that the argument can be an array of pages, encoded pages,
|
||||
* or folio pointers. We ignore any encoded bits, and turn any of
|
||||
* them into just a folio that gets free'd.
|
||||
* Context: May be called in process or interrupt context, but not in NMI
|
||||
* context. May be called while holding a spinlock.
|
||||
*/
|
||||
void release_pages(release_pages_arg arg, int nr)
|
||||
void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
|
||||
{
|
||||
int i;
|
||||
struct encoded_page **encoded = arg.encoded_pages;
|
||||
LIST_HEAD(pages_to_free);
|
||||
int i, j;
|
||||
struct lruvec *lruvec = NULL;
|
||||
unsigned long flags = 0;
|
||||
unsigned int lock_batch;
|
||||
|
||||
for (i = 0; i < nr; i++) {
|
||||
unsigned int nr_refs = 1;
|
||||
struct folio *folio;
|
||||
|
||||
/* Turn any of the argument types into a folio */
|
||||
folio = page_folio(encoded_page_ptr(encoded[i]));
|
||||
|
||||
/* Is our next entry actually "nr_pages" -> "nr_refs" ? */
|
||||
if (unlikely(encoded_page_flags(encoded[i]) &
|
||||
ENCODED_PAGE_BIT_NR_PAGES_NEXT))
|
||||
nr_refs = encoded_nr_pages(encoded[++i]);
|
||||
|
||||
/*
|
||||
* Make sure the IRQ-safe lock-holding time does not get
|
||||
* excessive with a continuous string of pages from the
|
||||
* same lruvec. The lock is held only if lruvec != NULL.
|
||||
*/
|
||||
if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
lruvec = NULL;
|
||||
}
|
||||
for (i = 0, j = 0; i < folios->nr; i++) {
|
||||
struct folio *folio = folios->folios[i];
|
||||
unsigned int nr_refs = refs ? refs[i] : 1;
|
||||
|
||||
if (is_huge_zero_page(&folio->page))
|
||||
continue;
|
||||
@@ -1018,34 +1009,73 @@ void release_pages(release_pages_arg arg, int nr)
|
||||
if (!folio_ref_sub_and_test(folio, nr_refs))
|
||||
continue;
|
||||
|
||||
if (folio_test_large(folio)) {
|
||||
/* hugetlb has its own memcg */
|
||||
if (folio_test_hugetlb(folio)) {
|
||||
if (lruvec) {
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
lruvec = NULL;
|
||||
}
|
||||
__folio_put_large(folio);
|
||||
free_huge_folio(folio);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (folio_test_lru(folio)) {
|
||||
struct lruvec *prev_lruvec = lruvec;
|
||||
folio_unqueue_deferred_split(folio);
|
||||
__page_cache_release(folio, &lruvec, &flags);
|
||||
|
||||
lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
|
||||
&flags);
|
||||
if (prev_lruvec != lruvec)
|
||||
lock_batch = 0;
|
||||
|
||||
lruvec_del_folio(lruvec, folio);
|
||||
__folio_clear_lru_flags(folio);
|
||||
}
|
||||
|
||||
list_add(&folio->lru, &pages_to_free);
|
||||
if (j != i)
|
||||
folios->folios[j] = folio;
|
||||
j++;
|
||||
}
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
if (!j) {
|
||||
folio_batch_reinit(folios);
|
||||
return;
|
||||
}
|
||||
|
||||
mem_cgroup_uncharge_list(&pages_to_free);
|
||||
free_unref_page_list(&pages_to_free);
|
||||
folios->nr = j;
|
||||
mem_cgroup_uncharge_folios(folios);
|
||||
free_unref_folios(folios);
|
||||
}
|
||||
EXPORT_SYMBOL(folios_put_refs);
|
||||
|
||||
/**
|
||||
* release_pages - batched put_page()
|
||||
* @arg: array of pages to release
|
||||
* @nr: number of pages
|
||||
*
|
||||
* Decrement the reference count on all the pages in @arg. If it
|
||||
* fell to zero, remove the page from the LRU and free it.
|
||||
*
|
||||
* Note that the argument can be an array of pages, encoded pages,
|
||||
* or folio pointers. We ignore any encoded bits, and turn any of
|
||||
* them into just a folio that gets free'd.
|
||||
*/
|
||||
void release_pages(release_pages_arg arg, int nr)
|
||||
{
|
||||
struct folio_batch fbatch;
|
||||
int refs[PAGEVEC_SIZE];
|
||||
struct encoded_page **encoded = arg.encoded_pages;
|
||||
int i;
|
||||
|
||||
folio_batch_init(&fbatch);
|
||||
for (i = 0; i < nr; i++) {
|
||||
/* Turn any of the argument types into a folio */
|
||||
struct folio *folio = page_folio(encoded_page_ptr(encoded[i]));
|
||||
|
||||
/* Is our next entry actually "nr_pages" -> "nr_refs" ? */
|
||||
refs[fbatch.nr] = 1;
|
||||
if (unlikely(encoded_page_flags(encoded[i]) &
|
||||
ENCODED_PAGE_BIT_NR_PAGES_NEXT))
|
||||
refs[fbatch.nr] = encoded_nr_pages(encoded[++i]);
|
||||
|
||||
if (folio_batch_add(&fbatch, folio) > 0)
|
||||
continue;
|
||||
folios_put_refs(&fbatch, refs);
|
||||
}
|
||||
|
||||
if (fbatch.nr)
|
||||
folios_put_refs(&fbatch, refs);
|
||||
}
|
||||
EXPORT_SYMBOL(release_pages);
|
||||
|
||||
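release_pages() above now only translates its encoded-page array into a folio batch plus a parallel refs[] array and hands both to folios_put_refs(); the callers converted elsewhere in this series (__folio_batch_release(), mlock_folio_batch(), folio_batch_move_lru()) go through folios_put(), which is presumably reduced to a thin wrapper along the lines of the sketch below (the exact definition lives in mm.h and is not shown in these hunks):

/* assumed wrapper: drop exactly one reference per folio in the batch */
static inline void folios_put(struct folio_batch *folios)
{
    folios_put_refs(folios, NULL);
}

The separate refs[PAGEVEC_SIZE] array exists so that a single ENCODED_PAGE_BIT_NR_PAGES_NEXT entry can drop several references on one folio in the same pass, as the loop above shows.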
@@ -1065,8 +1095,7 @@ void __folio_batch_release(struct folio_batch *fbatch)
|
||||
lru_add_drain();
|
||||
fbatch->percpu_pvec_drained = true;
|
||||
}
|
||||
release_pages(fbatch->folios, folio_batch_count(fbatch));
|
||||
folio_batch_reinit(fbatch);
|
||||
folios_put(fbatch);
|
||||
}
|
||||
EXPORT_SYMBOL(__folio_batch_release);
|
||||
|
||||
|
@@ -1358,6 +1358,7 @@ const char * const vmstat_text[] = {
|
||||
"thp_split_page",
|
||||
"thp_split_page_failed",
|
||||
"thp_deferred_split_page",
|
||||
"thp_underused_split_page",
|
||||
"thp_split_pmd",
|
||||
"thp_shatter_page",
|
||||
"thp_shatter_page_failed",
|
||||
|
@@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/
|
||||
obj-$(CONFIG_INET) += ipv4/
|
||||
obj-$(CONFIG_TLS) += tls/
|
||||
obj-$(CONFIG_XFRM) += xfrm/
|
||||
obj-$(CONFIG_UNIX_SCM) += unix/
|
||||
obj-$(CONFIG_UNIX) += unix/
|
||||
obj-y += ipv6/
|
||||
obj-$(CONFIG_BPFILTER) += bpfilter/
|
||||
obj-$(CONFIG_PACKET) += packet/
|
||||
|
@@ -36,6 +36,7 @@
#include <net/compat.h>
#include <net/scm.h>
#include <net/cls_cgroup.h>
#include <net/af_unix.h>


/*
@@ -85,8 +86,15 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
return -ENOMEM;
*fplp = fpl;
fpl->count = 0;
fpl->count_unix = 0;
fpl->max = SCM_MAX_FD;
fpl->user = NULL;
#if IS_ENABLED(CONFIG_UNIX)
fpl->inflight = false;
fpl->dead = false;
fpl->edges = NULL;
INIT_LIST_HEAD(&fpl->vertices);
#endif
}
fpp = &fpl->fp[fpl->count];

@@ -109,6 +117,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
fput(file);
return -EINVAL;
}
if (unix_get_socket(file))
fpl->count_unix++;

*fpp++ = file;
fpl->count++;
}
@@ -366,13 +377,18 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
if (!fpl)
return NULL;

new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
new_fpl = kmemdup(fpl, sizeof(*fpl),
GFP_KERNEL_ACCOUNT);
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
new_fpl->max = new_fpl->count;
new_fpl->user = get_uid(fpl->user);
#if IS_ENABLED(CONFIG_UNIX)
new_fpl->inflight = false;
new_fpl->edges = NULL;
INIT_LIST_HEAD(&new_fpl->vertices);
#endif
}
return new_fpl;
}
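In the scm_fp_dup() hunk above, the duplicate is now taken with kmemdup(fpl, sizeof(*fpl), ...) instead of copying only the used prefix up to fp[fpl->count], so the copy always carries the full structure layout. The sketch below contrasts the two copy strategies on a hypothetical fd_list type; it is a simplified userspace model, not the kernel structure or its motivation.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_FD 253 /* stand-in for SCM_MAX_FD */

struct fd_list {
    int count;
    int max;
    int fd[MAX_FD];
};

/* Duplicate only the used prefix: cheaper, but the copy has no head-room left. */
static struct fd_list *dup_prefix(const struct fd_list *src)
{
    size_t used = offsetof(struct fd_list, fd[src->count]);
    struct fd_list *copy = malloc(used);

    if (!copy)
        return NULL;
    memcpy(copy, src, used);
    copy->max = copy->count;
    return copy;
}

/* Duplicate the whole structure: the copy keeps room for up to MAX_FD entries. */
static struct fd_list *dup_full(const struct fd_list *src)
{
    struct fd_list *copy = malloc(sizeof(*copy));

    if (!copy)
        return NULL;
    memcpy(copy, src, sizeof(*copy));
    return copy;
}

int main(void)
{
    struct fd_list orig = { .count = 2, .max = MAX_FD, .fd = { 3, 4 } };
    struct fd_list *a = dup_prefix(&orig);
    struct fd_list *b = dup_full(&orig);

    printf("prefix copy: count=%d max=%d\n", a->count, a->max);
    printf("full copy:   count=%d max=%d\n", b->count, b->max);
    free(a);
    free(b);
    return 0;
}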
@@ -16,11 +16,6 @@ config UNIX

Say Y unless you know what you are doing.

config UNIX_SCM
bool
depends on UNIX
default y

config AF_UNIX_OOB
bool
depends on UNIX

@@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o

obj-$(CONFIG_UNIX_DIAG) += unix_diag.o
unix_diag-y := diag.o

obj-$(CONFIG_UNIX_SCM) += scm.o
@@ -117,8 +117,6 @@
#include <linux/file.h>
#include <linux/btf_ids.h>

#include "scm.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
@@ -980,11 +978,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
sk->sk_destruct = unix_sock_destructor;
u = unix_sk(sk);
u->inflight = 0;
u->listener = NULL;
u->vertex = NULL;
u->path.dentry = NULL;
u->path.mnt = NULL;
spin_lock_init(&u->lock);
INIT_LIST_HEAD(&u->link);
mutex_init(&u->iolock); /* single task reading lock */
mutex_init(&u->bindlock); /* single task binding lock */
init_waitqueue_head(&u->peer_wait);
@@ -1583,6 +1581,7 @@ restart:
newsk->sk_type = sk->sk_type;
init_peercred(newsk);
newu = unix_sk(newsk);
newu->listener = other;
RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
otheru = unix_sk(other);

@@ -1678,8 +1677,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
bool kern)
{
struct sock *sk = sock->sk;
struct sock *tsk;
struct sk_buff *skb;
struct sock *tsk;
int err;

err = -EOPNOTSUPP;
@@ -1709,6 +1708,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,

/* attach accepted sock to socket */
unix_state_lock(tsk);
unix_update_edges(unix_sk(tsk));
newsock->state = SS_CONNECTED;
unix_sock_inherit_flags(sock, newsock);
sock_graft(tsk, newsock);
@@ -1752,51 +1752,65 @@ out:
return err;
}

/* The "user->unix_inflight" variable is protected by the garbage
* collection lock, and we just read it locklessly here. If you go
* over the limit, there might be a tiny race in actually noticing
* it across threads. Tough.
*/
static inline bool too_many_unix_fds(struct task_struct *p)
{
struct user_struct *user = current_user();

if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
return false;
}

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
if (too_many_unix_fds(current))
return -ETOOMANYREFS;

/* Need to duplicate file references for the sake of garbage
* collection. Otherwise a socket in the fps might become a
* candidate for GC while the skb is not yet queued.
*/
UNIXCB(skb).fp = scm_fp_dup(scm->fp);
if (!UNIXCB(skb).fp)
return -ENOMEM;

if (unix_prepare_fpl(UNIXCB(skb).fp))
return -ENOMEM;

return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
scm->fp = UNIXCB(skb).fp;
UNIXCB(skb).fp = NULL;

unix_destroy_fpl(scm->fp);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
scm->fp = scm_fp_dup(UNIXCB(skb).fp);
}

/*
* Garbage collection of unix sockets starts by selecting a set of
* candidate sockets which have reference only from being in flight
* (total_refs == inflight_refs). This condition is checked once during
* the candidate collection phase, and candidates are marked as such, so
* that non-candidates can later be ignored. While inflight_refs is
* protected by unix_gc_lock, total_refs (file count) is not, hence this
* is an instantaneous decision.
*
* Once a candidate, however, the socket must not be reinstalled into a
* file descriptor while the garbage collection is in progress.
*
* If the above conditions are met, then the directed graph of
* candidates (*) does not change while unix_gc_lock is held.
*
* Any operations that changes the file count through file descriptors
* (dup, close, sendmsg) does not change the graph since candidates are
* not installed in fds.
*
* Dequeing a candidate via recvmsg would install it into an fd, but
* that takes unix_gc_lock to decrement the inflight count, so it's
* serialized with garbage collection.
*
* MSG_PEEK is special in that it does not change the inflight count,
* yet does install the socket into an fd. The following lock/unlock
* pair is to ensure serialization with garbage collection. It must be
* done between incrementing the file count and installing the file into
* an fd.
*
* If garbage collection starts after the barrier provided by the
* lock/unlock, then it will see the elevated refcount and not mark this
* as a candidate. If a garbage collection is already in progress
* before the file count was incremented, then the lock/unlock pair will
* ensure that garbage collection is finished before progressing to
* installing the fd.
*
* (*) A -> B where B is on the queue of A or B is on the queue of C
* which is on the queue of listening socket A.
*/
spin_lock(&unix_gc_lock);
spin_unlock(&unix_gc_lock);
static void unix_destruct_scm(struct sk_buff *skb)
{
struct scm_cookie scm;

memset(&scm, 0, sizeof(scm));
scm.pid = UNIXCB(skb).pid;
if (UNIXCB(skb).fp)
unix_detach_fds(&scm, skb);

/* Alas, it calls VFS */
/* So fscking what? fput() had been SMP-safe since the last Summer */
scm_destroy(&scm);
sock_wfree(skb);
}

static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
@@ -1855,8 +1869,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);

if (unlikely(fp && fp->count))
if (unlikely(fp && fp->count)) {
atomic_add(fp->count, &u->scm_stat.nr_fds);
unix_add_edges(fp, u);
}
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
@@ -1864,8 +1880,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);

if (unlikely(fp && fp->count))
if (unlikely(fp && fp->count)) {
atomic_sub(fp->count, &u->scm_stat.nr_fds);
unix_del_edges(fp);
}
}

/*
@@ -1885,11 +1903,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
long timeo;
int err;

wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;

wait_for_unix_gc(scm.fp);

err = -EOPNOTSUPP;
if (msg->msg_flags&MSG_OOB)
goto out;
@@ -2157,11 +2176,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
bool fds_sent = false;
int data_len;

wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
if (err < 0)
return err;

wait_for_unix_gc(scm.fp);

err = -EOPNOTSUPP;
if (msg->msg_flags & MSG_OOB) {
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
@@ -81,249 +81,519 @@
|
||||
#include <net/scm.h>
|
||||
#include <net/tcp_states.h>
|
||||
|
||||
#include "scm.h"
|
||||
|
||||
/* Internal data structures and random procedures: */
|
||||
|
||||
static LIST_HEAD(gc_candidates);
|
||||
|
||||
static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
|
||||
struct sk_buff_head *hitlist)
|
||||
struct unix_sock *unix_get_socket(struct file *filp)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
struct sk_buff *next;
|
||||
struct inode *inode = file_inode(filp);
|
||||
|
||||
spin_lock(&x->sk_receive_queue.lock);
|
||||
skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
|
||||
/* Do we have file descriptors ? */
|
||||
if (UNIXCB(skb).fp) {
|
||||
bool hit = false;
|
||||
/* Process the descriptors of this socket */
|
||||
int nfd = UNIXCB(skb).fp->count;
|
||||
struct file **fp = UNIXCB(skb).fp->fp;
|
||||
/* Socket ? */
|
||||
if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
|
||||
struct socket *sock = SOCKET_I(inode);
|
||||
const struct proto_ops *ops;
|
||||
struct sock *sk = sock->sk;
|
||||
|
||||
while (nfd--) {
|
||||
/* Get the socket the fd matches if it indeed does so */
|
||||
struct unix_sock *u = unix_get_socket(*fp++);
|
||||
ops = READ_ONCE(sock->ops);
|
||||
|
||||
/* Ignore non-candidates, they could have been added
|
||||
* to the queues after starting the garbage collection
|
||||
*/
|
||||
if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
|
||||
hit = true;
|
||||
|
||||
func(u);
|
||||
}
|
||||
}
|
||||
if (hit && hitlist != NULL) {
|
||||
__skb_unlink(skb, &x->sk_receive_queue);
|
||||
__skb_queue_tail(hitlist, skb);
|
||||
}
|
||||
}
|
||||
/* PF_UNIX ? */
|
||||
if (sk && ops && ops->family == PF_UNIX)
|
||||
return unix_sk(sk);
|
||||
}
|
||||
spin_unlock(&x->sk_receive_queue.lock);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
|
||||
struct sk_buff_head *hitlist)
|
||||
static struct unix_vertex *unix_edge_successor(struct unix_edge *edge)
|
||||
{
|
||||
if (x->sk_state != TCP_LISTEN) {
|
||||
scan_inflight(x, func, hitlist);
|
||||
} else {
|
||||
struct sk_buff *skb;
|
||||
struct sk_buff *next;
|
||||
struct unix_sock *u;
|
||||
LIST_HEAD(embryos);
|
||||
|
||||
/* For a listening socket collect the queued embryos
|
||||
* and perform a scan on them as well.
|
||||
*/
|
||||
spin_lock(&x->sk_receive_queue.lock);
|
||||
skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
|
||||
u = unix_sk(skb->sk);
|
||||
|
||||
/* An embryo cannot be in-flight, so it's safe
|
||||
* to use the list link.
|
||||
*/
|
||||
BUG_ON(!list_empty(&u->link));
|
||||
list_add_tail(&u->link, &embryos);
|
||||
}
|
||||
spin_unlock(&x->sk_receive_queue.lock);
|
||||
|
||||
while (!list_empty(&embryos)) {
|
||||
u = list_entry(embryos.next, struct unix_sock, link);
|
||||
scan_inflight(&u->sk, func, hitlist);
|
||||
list_del_init(&u->link);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void dec_inflight(struct unix_sock *usk)
|
||||
{
|
||||
usk->inflight--;
|
||||
}
|
||||
|
||||
static void inc_inflight(struct unix_sock *usk)
|
||||
{
|
||||
usk->inflight++;
|
||||
}
|
||||
|
||||
static void inc_inflight_move_tail(struct unix_sock *u)
|
||||
{
|
||||
u->inflight++;
|
||||
|
||||
/* If this still might be part of a cycle, move it to the end
|
||||
* of the list, so that it's checked even if it was already
|
||||
* passed over
|
||||
/* If an embryo socket has a fd,
|
||||
* the listener indirectly holds the fd's refcnt.
|
||||
*/
|
||||
if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
|
||||
list_move_tail(&u->link, &gc_candidates);
|
||||
if (edge->successor->listener)
|
||||
return unix_sk(edge->successor->listener)->vertex;
|
||||
|
||||
return edge->successor->vertex;
|
||||
}
|
||||
|
||||
static bool unix_graph_maybe_cyclic;
|
||||
static bool unix_graph_grouped;
|
||||
|
||||
static void unix_update_graph(struct unix_vertex *vertex)
|
||||
{
|
||||
/* If the receiver socket is not inflight, no cyclic
|
||||
* reference could be formed.
|
||||
*/
|
||||
if (!vertex)
|
||||
return;
|
||||
|
||||
unix_graph_maybe_cyclic = true;
|
||||
unix_graph_grouped = false;
|
||||
}
|
||||
|
||||
static LIST_HEAD(unix_unvisited_vertices);
|
||||
|
||||
enum unix_vertex_index {
|
||||
UNIX_VERTEX_INDEX_MARK1,
|
||||
UNIX_VERTEX_INDEX_MARK2,
|
||||
UNIX_VERTEX_INDEX_START,
|
||||
};
|
||||
|
||||
static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;
|
||||
|
||||
static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
|
||||
{
|
||||
struct unix_vertex *vertex = edge->predecessor->vertex;
|
||||
|
||||
if (!vertex) {
|
||||
vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
|
||||
vertex->index = unix_vertex_unvisited_index;
|
||||
vertex->out_degree = 0;
|
||||
INIT_LIST_HEAD(&vertex->edges);
|
||||
INIT_LIST_HEAD(&vertex->scc_entry);
|
||||
|
||||
list_move_tail(&vertex->entry, &unix_unvisited_vertices);
|
||||
edge->predecessor->vertex = vertex;
|
||||
}
|
||||
|
||||
vertex->out_degree++;
|
||||
list_add_tail(&edge->vertex_entry, &vertex->edges);
|
||||
|
||||
unix_update_graph(unix_edge_successor(edge));
|
||||
}
|
||||
|
||||
static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
|
||||
{
|
||||
struct unix_vertex *vertex = edge->predecessor->vertex;
|
||||
|
||||
if (!fpl->dead)
|
||||
unix_update_graph(unix_edge_successor(edge));
|
||||
|
||||
list_del(&edge->vertex_entry);
|
||||
vertex->out_degree--;
|
||||
|
||||
if (!vertex->out_degree) {
|
||||
edge->predecessor->vertex = NULL;
|
||||
list_move_tail(&vertex->entry, &fpl->vertices);
|
||||
}
|
||||
}
|
||||
|
||||
static void unix_free_vertices(struct scm_fp_list *fpl)
|
||||
{
|
||||
struct unix_vertex *vertex, *next_vertex;
|
||||
|
||||
list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) {
|
||||
list_del(&vertex->entry);
|
||||
kfree(vertex);
|
||||
}
|
||||
}
|
||||
|
||||
static DEFINE_SPINLOCK(unix_gc_lock);
|
||||
unsigned int unix_tot_inflight;
|
||||
|
||||
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
|
||||
{
|
||||
int i = 0, j = 0;
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
if (!fpl->count_unix)
|
||||
goto out;
|
||||
|
||||
do {
|
||||
struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
|
||||
struct unix_edge *edge;
|
||||
|
||||
if (!inflight)
|
||||
continue;
|
||||
|
||||
edge = fpl->edges + i++;
|
||||
edge->predecessor = inflight;
|
||||
edge->successor = receiver;
|
||||
|
||||
unix_add_edge(fpl, edge);
|
||||
} while (i < fpl->count_unix);
|
||||
|
||||
receiver->scm_stat.nr_unix_fds += fpl->count_unix;
|
||||
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
|
||||
out:
|
||||
WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);
|
||||
|
||||
spin_unlock(&unix_gc_lock);
|
||||
|
||||
fpl->inflight = true;
|
||||
|
||||
unix_free_vertices(fpl);
|
||||
}
|
||||
|
||||
void unix_del_edges(struct scm_fp_list *fpl)
|
||||
{
|
||||
struct unix_sock *receiver;
|
||||
int i = 0;
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
if (!fpl->count_unix)
|
||||
goto out;
|
||||
|
||||
do {
|
||||
struct unix_edge *edge = fpl->edges + i++;
|
||||
|
||||
unix_del_edge(fpl, edge);
|
||||
} while (i < fpl->count_unix);
|
||||
|
||||
if (!fpl->dead) {
|
||||
receiver = fpl->edges[0].successor;
|
||||
receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
|
||||
}
|
||||
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
|
||||
out:
|
||||
WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
|
||||
|
||||
spin_unlock(&unix_gc_lock);
|
||||
|
||||
fpl->inflight = false;
|
||||
}
|
||||
|
||||
void unix_update_edges(struct unix_sock *receiver)
|
||||
{
|
||||
/* nr_unix_fds is only updated under unix_state_lock().
|
||||
* If it's 0 here, the embryo socket is not part of the
|
||||
* inflight graph, and GC will not see it, so no lock needed.
|
||||
*/
|
||||
if (!receiver->scm_stat.nr_unix_fds) {
|
||||
receiver->listener = NULL;
|
||||
} else {
|
||||
spin_lock(&unix_gc_lock);
|
||||
unix_update_graph(unix_sk(receiver->listener)->vertex);
|
||||
receiver->listener = NULL;
|
||||
spin_unlock(&unix_gc_lock);
|
||||
}
|
||||
}
|
||||
|
||||
int unix_prepare_fpl(struct scm_fp_list *fpl)
|
||||
{
|
||||
struct unix_vertex *vertex;
|
||||
int i;
|
||||
|
||||
if (!fpl->count_unix)
|
||||
return 0;
|
||||
|
||||
for (i = 0; i < fpl->count_unix; i++) {
|
||||
vertex = kmalloc(sizeof(*vertex), GFP_KERNEL);
|
||||
if (!vertex)
|
||||
goto err;
|
||||
|
||||
list_add(&vertex->entry, &fpl->vertices);
|
||||
}
|
||||
|
||||
fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges),
|
||||
GFP_KERNEL_ACCOUNT);
|
||||
if (!fpl->edges)
|
||||
goto err;
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
unix_free_vertices(fpl);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
void unix_destroy_fpl(struct scm_fp_list *fpl)
|
||||
{
|
||||
if (fpl->inflight)
|
||||
unix_del_edges(fpl);
|
||||
|
||||
kvfree(fpl->edges);
|
||||
unix_free_vertices(fpl);
|
||||
}
|
||||
|
||||
static bool unix_vertex_dead(struct unix_vertex *vertex)
{
struct unix_edge *edge;
struct unix_sock *u;
long total_ref;

list_for_each_entry(edge, &vertex->edges, vertex_entry) {
struct unix_vertex *next_vertex = unix_edge_successor(edge);

/* The vertex's fd can be received by a non-inflight socket. */
if (!next_vertex)
return false;

/* The vertex's fd can be received by an inflight socket in
* another SCC.
*/
if (next_vertex->scc_index != vertex->scc_index)
return false;
}

/* No receiver exists out of the same SCC. */

edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
u = edge->predecessor;
total_ref = file_count(u->sk.sk_socket->file);

/* If not close()d, total_ref > out_degree. */
if (total_ref != vertex->out_degree)
return false;

return true;
}
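unix_vertex_dead() above treats a vertex as dead when every edge stays inside its own SCC and the backing file's reference count equals the vertex's out-degree, meaning every remaining reference is an in-flight one. A minimal userspace model of that predicate is sketched below; the vertex layout and field names are invented for illustration and are not the kernel types.

#include <stdbool.h>
#include <stdio.h>

struct vertex {
    int scc_index;          /* SCC the vertex was grouped into */
    int out_degree;         /* number of in-flight copies of its fd */
    int file_refs;          /* total references on the backing file */
    int nr_edges;
    struct vertex *succ[8]; /* receivers of this vertex's fd */
};

/* Dead if no edge leaves the SCC and every file reference is an in-flight one. */
static bool vertex_dead(const struct vertex *v)
{
    for (int i = 0; i < v->nr_edges; i++) {
        if (!v->succ[i])                           /* received by a non-inflight socket */
            return false;
        if (v->succ[i]->scc_index != v->scc_index) /* edge escapes the SCC */
            return false;
    }
    return v->file_refs == v->out_degree;          /* no fd installed in user space */
}

int main(void)
{
    struct vertex a = { .scc_index = 1, .out_degree = 1, .file_refs = 1 };
    struct vertex b = { .scc_index = 1, .out_degree = 1, .file_refs = 2 };

    a.nr_edges = 1; a.succ[0] = &b;
    b.nr_edges = 1; b.succ[0] = &a;

    printf("vertex a dead? %d\n", vertex_dead(&a));
    printf("SCC {a,b} dead? %d (b still has an external file ref)\n",
           vertex_dead(&a) && vertex_dead(&b));
    return 0;
}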
enum unix_recv_queue_lock_class {
|
||||
U_RECVQ_LOCK_NORMAL,
|
||||
U_RECVQ_LOCK_EMBRYO,
|
||||
};
|
||||
|
||||
static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist)
|
||||
{
|
||||
skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist);
|
||||
|
||||
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
|
||||
if (u->oob_skb) {
|
||||
WARN_ON_ONCE(skb_unref(u->oob_skb));
|
||||
u->oob_skb = NULL;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
|
||||
{
|
||||
struct unix_vertex *vertex;
|
||||
|
||||
list_for_each_entry_reverse(vertex, scc, scc_entry) {
|
||||
struct sk_buff_head *queue;
|
||||
struct unix_edge *edge;
|
||||
struct unix_sock *u;
|
||||
|
||||
edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
|
||||
u = edge->predecessor;
|
||||
queue = &u->sk.sk_receive_queue;
|
||||
|
||||
spin_lock(&queue->lock);
|
||||
|
||||
if (u->sk.sk_state == TCP_LISTEN) {
|
||||
struct sk_buff *skb;
|
||||
|
||||
skb_queue_walk(queue, skb) {
|
||||
struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;
|
||||
|
||||
/* listener -> embryo order, the inversion never happens. */
|
||||
spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
|
||||
unix_collect_queue(unix_sk(skb->sk), hitlist);
|
||||
spin_unlock(&embryo_queue->lock);
|
||||
}
|
||||
} else {
|
||||
unix_collect_queue(u, hitlist);
|
||||
}
|
||||
|
||||
spin_unlock(&queue->lock);
|
||||
}
|
||||
}
|
||||
|
||||
static bool unix_scc_cyclic(struct list_head *scc)
|
||||
{
|
||||
struct unix_vertex *vertex;
|
||||
struct unix_edge *edge;
|
||||
|
||||
/* SCC containing multiple vertices ? */
|
||||
if (!list_is_singular(scc))
|
||||
return true;
|
||||
|
||||
vertex = list_first_entry(scc, typeof(*vertex), scc_entry);
|
||||
|
||||
/* Self-reference or a embryo-listener circle ? */
|
||||
list_for_each_entry(edge, &vertex->edges, vertex_entry) {
|
||||
if (unix_edge_successor(edge) == vertex)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static LIST_HEAD(unix_visited_vertices);
|
||||
static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
|
||||
|
||||
static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index,
|
||||
struct sk_buff_head *hitlist)
|
||||
{
|
||||
LIST_HEAD(vertex_stack);
|
||||
struct unix_edge *edge;
|
||||
LIST_HEAD(edge_stack);
|
||||
|
||||
next_vertex:
|
||||
/* Push vertex to vertex_stack and mark it as on-stack
|
||||
* (index >= UNIX_VERTEX_INDEX_START).
|
||||
* The vertex will be popped when finalising SCC later.
|
||||
*/
|
||||
list_add(&vertex->scc_entry, &vertex_stack);
|
||||
|
||||
vertex->index = *last_index;
|
||||
vertex->scc_index = *last_index;
|
||||
(*last_index)++;
|
||||
|
||||
/* Explore neighbour vertices (receivers of the current vertex's fd). */
|
||||
list_for_each_entry(edge, &vertex->edges, vertex_entry) {
|
||||
struct unix_vertex *next_vertex = unix_edge_successor(edge);
|
||||
|
||||
if (!next_vertex)
|
||||
continue;
|
||||
|
||||
if (next_vertex->index == unix_vertex_unvisited_index) {
|
||||
/* Iterative deepening depth first search
|
||||
*
|
||||
* 1. Push a forward edge to edge_stack and set
|
||||
* the successor to vertex for the next iteration.
|
||||
*/
|
||||
list_add(&edge->stack_entry, &edge_stack);
|
||||
|
||||
vertex = next_vertex;
|
||||
goto next_vertex;
|
||||
|
||||
/* 2. Pop the edge directed to the current vertex
|
||||
* and restore the ancestor for backtracking.
|
||||
*/
|
||||
prev_vertex:
|
||||
edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
|
||||
list_del_init(&edge->stack_entry);
|
||||
|
||||
next_vertex = vertex;
|
||||
vertex = edge->predecessor->vertex;
|
||||
|
||||
/* If the successor has a smaller scc_index, two vertices
|
||||
* are in the same SCC, so propagate the smaller scc_index
|
||||
* to skip SCC finalisation.
|
||||
*/
|
||||
vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
|
||||
} else if (next_vertex->index != unix_vertex_grouped_index) {
|
||||
/* Loop detected by a back/cross edge.
|
||||
*
|
||||
* The successor is on vertex_stack, so two vertices are in
|
||||
* the same SCC. If the successor has a smaller *scc_index*,
|
||||
* propagate it to skip SCC finalisation.
|
||||
*/
|
||||
vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
|
||||
} else {
|
||||
/* The successor was already grouped as another SCC */
|
||||
}
|
||||
}
|
||||
|
||||
if (vertex->index == vertex->scc_index) {
|
||||
struct unix_vertex *v;
|
||||
struct list_head scc;
|
||||
bool scc_dead = true;
|
||||
|
||||
/* SCC finalised.
|
||||
*
|
||||
* If the scc_index was not updated, all the vertices above on
|
||||
* vertex_stack are in the same SCC. Group them using scc_entry.
|
||||
*/
|
||||
__list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
|
||||
|
||||
list_for_each_entry_reverse(v, &scc, scc_entry) {
|
||||
/* Don't restart DFS from this vertex in unix_walk_scc(). */
|
||||
list_move_tail(&v->entry, &unix_visited_vertices);
|
||||
|
||||
/* Mark vertex as off-stack. */
|
||||
v->index = unix_vertex_grouped_index;
|
||||
|
||||
if (scc_dead)
|
||||
scc_dead = unix_vertex_dead(v);
|
||||
}
|
||||
|
||||
if (scc_dead)
|
||||
unix_collect_skb(&scc, hitlist);
|
||||
else if (!unix_graph_maybe_cyclic)
|
||||
unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
|
||||
|
||||
list_del(&scc);
|
||||
}
|
||||
|
||||
/* Need backtracking ? */
|
||||
if (!list_empty(&edge_stack))
|
||||
goto prev_vertex;
|
||||
}
|
||||
|
||||
static void unix_walk_scc(struct sk_buff_head *hitlist)
|
||||
{
|
||||
unsigned long last_index = UNIX_VERTEX_INDEX_START;
|
||||
|
||||
unix_graph_maybe_cyclic = false;
|
||||
|
||||
/* Visit every vertex exactly once.
|
||||
* __unix_walk_scc() moves visited vertices to unix_visited_vertices.
|
||||
*/
|
||||
while (!list_empty(&unix_unvisited_vertices)) {
|
||||
struct unix_vertex *vertex;
|
||||
|
||||
vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
|
||||
__unix_walk_scc(vertex, &last_index, hitlist);
|
||||
}
|
||||
|
||||
list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
|
||||
swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
|
||||
|
||||
unix_graph_grouped = true;
|
||||
}
|
||||
|
||||
static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
|
||||
{
|
||||
unix_graph_maybe_cyclic = false;
|
||||
|
||||
while (!list_empty(&unix_unvisited_vertices)) {
|
||||
struct unix_vertex *vertex;
|
||||
struct list_head scc;
|
||||
bool scc_dead = true;
|
||||
|
||||
vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
|
||||
list_add(&scc, &vertex->scc_entry);
|
||||
|
||||
list_for_each_entry_reverse(vertex, &scc, scc_entry) {
|
||||
list_move_tail(&vertex->entry, &unix_visited_vertices);
|
||||
|
||||
if (scc_dead)
|
||||
scc_dead = unix_vertex_dead(vertex);
|
||||
}
|
||||
|
||||
if (scc_dead)
|
||||
unix_collect_skb(&scc, hitlist);
|
||||
else if (!unix_graph_maybe_cyclic)
|
||||
unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
|
||||
|
||||
list_del(&scc);
|
||||
}
|
||||
|
||||
list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
|
||||
}
|
||||
|
||||
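The __unix_walk_scc()/unix_walk_scc() pair above is an iterative variant of Tarjan's strongly connected components algorithm, using an explicit edge stack in place of recursion and a shared scc_index as the low-link value. The self-contained sketch below shows the classic recursive form of the same algorithm on a small adjacency-matrix graph; it is a textbook illustration, not the kernel implementation.

#include <stdio.h>

#define NV 4
#define UNVISITED -1

static int adj[NV][NV] = {
    /* 0 -> 1, 1 -> 2, 2 -> 0 form a cycle; 3 -> 0 hangs off it */
    {0, 1, 0, 0},
    {0, 0, 1, 0},
    {1, 0, 0, 0},
    {1, 0, 0, 0},
};

static int index_of[NV], lowlink[NV], on_stack[NV];
static int stack[NV], sp, next_index;

static int min(int a, int b) { return a < b ? a : b; }

static void strongconnect(int v)
{
    index_of[v] = lowlink[v] = next_index++;
    stack[sp++] = v;
    on_stack[v] = 1;

    for (int w = 0; w < NV; w++) {
        if (!adj[v][w])
            continue;
        if (index_of[w] == UNVISITED) {
            strongconnect(w);                          /* tree edge: recurse */
            lowlink[v] = min(lowlink[v], lowlink[w]);
        } else if (on_stack[w]) {
            lowlink[v] = min(lowlink[v], index_of[w]); /* back/cross edge inside the stack */
        }
    }

    if (lowlink[v] == index_of[v]) {                   /* v is the root of an SCC */
        int w;

        printf("SCC:");
        do {
            w = stack[--sp];
            on_stack[w] = 0;
            printf(" %d", w);
        } while (w != v);
        printf("\n");
    }
}

int main(void)
{
    for (int v = 0; v < NV; v++)
        index_of[v] = UNVISITED;
    for (int v = 0; v < NV; v++)
        if (index_of[v] == UNVISITED)
            strongconnect(v);
    return 0;
}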
static bool gc_in_progress;
|
||||
|
||||
static void __unix_gc(struct work_struct *work)
|
||||
{
|
||||
struct sk_buff *next_skb, *skb;
|
||||
struct unix_sock *u;
|
||||
struct unix_sock *next;
|
||||
struct sk_buff_head hitlist;
|
||||
struct list_head cursor;
|
||||
LIST_HEAD(not_cycle_list);
|
||||
struct sk_buff *skb;
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
/* First, select candidates for garbage collection. Only
|
||||
* in-flight sockets are considered, and from those only ones
|
||||
* which don't have any external reference.
|
||||
*
|
||||
* Holding unix_gc_lock will protect these candidates from
|
||||
* being detached, and hence from gaining an external
|
||||
* reference. Since there are no possible receivers, all
|
||||
* buffers currently on the candidates' queues stay there
|
||||
* during the garbage collection.
|
||||
*
|
||||
* We also know that no new candidate can be added onto the
|
||||
* receive queues. Other, non candidate sockets _can_ be
|
||||
* added to queue, so we must make sure only to touch
|
||||
* candidates.
|
||||
*
|
||||
* Embryos, though never candidates themselves, affect which
|
||||
* candidates are reachable by the garbage collector. Before
|
||||
* being added to a listener's queue, an embryo may already
|
||||
* receive data carrying SCM_RIGHTS, potentially making the
|
||||
* passed socket a candidate that is not yet reachable by the
|
||||
* collector. It becomes reachable once the embryo is
|
||||
* enqueued. Therefore, we must ensure that no SCM-laden
|
||||
* embryo appears in a (candidate) listener's queue between
|
||||
* consecutive scan_children() calls.
|
||||
*/
|
||||
list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
|
||||
struct sock *sk = &u->sk;
|
||||
long total_refs;
|
||||
|
||||
total_refs = file_count(sk->sk_socket->file);
|
||||
|
||||
BUG_ON(!u->inflight);
|
||||
BUG_ON(total_refs < u->inflight);
|
||||
if (total_refs == u->inflight) {
|
||||
list_move_tail(&u->link, &gc_candidates);
|
||||
__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
|
||||
__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
|
||||
|
||||
if (sk->sk_state == TCP_LISTEN) {
|
||||
unix_state_lock_nested(sk, U_LOCK_GC_LISTENER);
|
||||
unix_state_unlock(sk);
|
||||
}
|
||||
}
|
||||
if (!unix_graph_maybe_cyclic) {
|
||||
spin_unlock(&unix_gc_lock);
|
||||
goto skip_gc;
|
||||
}
|
||||
|
||||
/* Now remove all internal in-flight reference to children of
|
||||
* the candidates.
|
||||
*/
|
||||
list_for_each_entry(u, &gc_candidates, link)
|
||||
scan_children(&u->sk, dec_inflight, NULL);
|
||||
__skb_queue_head_init(&hitlist);
|
||||
|
||||
/* Restore the references for children of all candidates,
|
||||
* which have remaining references. Do this recursively, so
|
||||
* only those remain, which form cyclic references.
|
||||
*
|
||||
* Use a "cursor" link, to make the list traversal safe, even
|
||||
* though elements might be moved about.
|
||||
*/
|
||||
list_add(&cursor, &gc_candidates);
|
||||
while (cursor.next != &gc_candidates) {
|
||||
u = list_entry(cursor.next, struct unix_sock, link);
|
||||
|
||||
/* Move cursor to after the current position. */
|
||||
list_move(&cursor, &u->link);
|
||||
|
||||
if (u->inflight) {
|
||||
list_move_tail(&u->link, ¬_cycle_list);
|
||||
__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
|
||||
scan_children(&u->sk, inc_inflight_move_tail, NULL);
|
||||
}
|
||||
}
|
||||
list_del(&cursor);
|
||||
|
||||
/* Now gc_candidates contains only garbage. Restore original
|
||||
* inflight counters for these as well, and remove the skbuffs
|
||||
* which are creating the cycle(s).
|
||||
*/
|
||||
skb_queue_head_init(&hitlist);
|
||||
list_for_each_entry(u, &gc_candidates, link) {
|
||||
scan_children(&u->sk, inc_inflight, &hitlist);
|
||||
|
||||
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
|
||||
if (u->oob_skb) {
|
||||
kfree_skb(u->oob_skb);
|
||||
u->oob_skb = NULL;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* not_cycle_list contains those sockets which do not make up a
|
||||
* cycle. Restore these to the inflight list.
|
||||
*/
|
||||
while (!list_empty(¬_cycle_list)) {
|
||||
u = list_entry(not_cycle_list.next, struct unix_sock, link);
|
||||
__clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
|
||||
list_move_tail(&u->link, &gc_inflight_list);
|
||||
}
|
||||
if (unix_graph_grouped)
|
||||
unix_walk_scc_fast(&hitlist);
|
||||
else
|
||||
unix_walk_scc(&hitlist);
|
||||
|
||||
spin_unlock(&unix_gc_lock);
|
||||
|
||||
/* We need io_uring to clean its registered files, ignore all io_uring
|
||||
* originated skbs. It's fine as io_uring doesn't keep references to
|
||||
* other io_uring instances and so killing all other files in the cycle
|
||||
* will put all io_uring references forcing it to go through normal
|
||||
* release.path eventually putting registered files.
|
||||
*/
|
||||
skb_queue_walk_safe(&hitlist, skb, next_skb) {
|
||||
if (skb->destructor == io_uring_destruct_scm) {
|
||||
__skb_unlink(skb, &hitlist);
|
||||
skb_queue_tail(&skb->sk->sk_receive_queue, skb);
|
||||
}
|
||||
skb_queue_walk(&hitlist, skb) {
|
||||
if (UNIXCB(skb).fp)
|
||||
UNIXCB(skb).fp->dead = true;
|
||||
}
|
||||
|
||||
/* Here we are. Hitlist is filled. Die. */
|
||||
__skb_queue_purge(&hitlist);
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
/* There could be io_uring registered files, just push them back to
|
||||
* the inflight list
|
||||
*/
|
||||
list_for_each_entry_safe(u, next, &gc_candidates, link)
|
||||
list_move_tail(&u->link, &gc_inflight_list);
|
||||
|
||||
/* All candidates should have been detached by now. */
|
||||
BUG_ON(!list_empty(&gc_candidates));
|
||||
|
||||
/* Paired with READ_ONCE() in wait_for_unix_gc(). */
|
||||
skip_gc:
|
||||
WRITE_ONCE(gc_in_progress, false);
|
||||
|
||||
spin_unlock(&unix_gc_lock);
|
||||
}
|
||||
|
||||
static DECLARE_WORK(unix_gc_work, __unix_gc);
@@ -335,8 +605,9 @@ void unix_gc(void)
}

#define UNIX_INFLIGHT_TRIGGER_GC 16000
#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)

void wait_for_unix_gc(void)
void wait_for_unix_gc(struct scm_fp_list *fpl)
{
/* If number of inflight sockets is insane,
* force a garbage collect right now.
@@ -348,6 +619,13 @@ void wait_for_unix_gc(void)
!READ_ONCE(gc_in_progress))
unix_gc();

/* Penalise users who want to send AF_UNIX sockets
* but whose sockets have not been received yet.
*/
if (!fpl || !fpl->count_unix ||
READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
return;

if (READ_ONCE(gc_in_progress))
flush_work(&unix_gc_work);
}
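The reworked wait_for_unix_gc() above takes the scm_fp_list being sent so that only senders who actually have many AF_UNIX fds still in flight are throttled: GC is kicked once the global in-flight count passes UNIX_INFLIGHT_TRIGGER_GC, and the sender is made to wait for a running GC only when its own user exceeds UNIX_INFLIGHT_SANE_USER. The userspace model below mirrors that decision with invented counters and print statements in place of the kernel state and work queue.

#include <stdbool.h>
#include <stdio.h>

#define TRIGGER_GC 16000     /* stand-in for UNIX_INFLIGHT_TRIGGER_GC */
#define SANE_USER  (253 * 8) /* stand-in for UNIX_INFLIGHT_SANE_USER  */

struct send_ctx {
    bool sending_unix_fds;      /* does this message carry AF_UNIX fds? */
    unsigned int user_inflight; /* this user's fds still in flight */
};

static unsigned int total_inflight; /* global in-flight counter */
static bool gc_running;

static void start_gc(void)    { gc_running = true;  printf("gc started\n"); }
static void wait_for_gc(void) { printf("sender blocked until gc finishes\n"); gc_running = false; }

/* Model of wait_for_unix_gc(fpl): kick GC globally, throttle only heavy users. */
static void maybe_gc(const struct send_ctx *ctx)
{
    if (total_inflight > TRIGGER_GC && !gc_running)
        start_gc();

    /* Well-behaved senders (no unix fds, or few in flight) are never blocked. */
    if (!ctx->sending_unix_fds || ctx->user_inflight < SANE_USER)
        return;

    if (gc_running)
        wait_for_gc();
}

int main(void)
{
    struct send_ctx light = { .sending_unix_fds = false, .user_inflight = 10 };
    struct send_ctx heavy = { .sending_unix_fds = true,  .user_inflight = 5000 };

    total_inflight = 20000;
    maybe_gc(&light); /* kicks gc, does not block */
    maybe_gc(&heavy); /* blocked until the running gc completes */
    return 0;
}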
net/unix/scm.c (156 lines deleted)
@@ -1,156 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/net.h>
|
||||
#include <linux/fs.h>
|
||||
#include <net/af_unix.h>
|
||||
#include <net/scm.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include "scm.h"
|
||||
|
||||
unsigned int unix_tot_inflight;
|
||||
EXPORT_SYMBOL(unix_tot_inflight);
|
||||
|
||||
LIST_HEAD(gc_inflight_list);
|
||||
EXPORT_SYMBOL(gc_inflight_list);
|
||||
|
||||
DEFINE_SPINLOCK(unix_gc_lock);
|
||||
EXPORT_SYMBOL(unix_gc_lock);
|
||||
|
||||
struct unix_sock *unix_get_socket(struct file *filp)
|
||||
{
|
||||
struct inode *inode = file_inode(filp);
|
||||
|
||||
/* Socket ? */
|
||||
if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
|
||||
struct socket *sock = SOCKET_I(inode);
|
||||
const struct proto_ops *ops = READ_ONCE(sock->ops);
|
||||
struct sock *s = sock->sk;
|
||||
|
||||
/* PF_UNIX ? */
|
||||
if (s && ops && ops->family == PF_UNIX)
|
||||
return unix_sk(s);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
EXPORT_SYMBOL(unix_get_socket);
|
||||
|
||||
/* Keep the number of times in flight count for the file
|
||||
* descriptor if it is for an AF_UNIX socket.
|
||||
*/
|
||||
void unix_inflight(struct user_struct *user, struct file *fp)
|
||||
{
|
||||
struct unix_sock *u = unix_get_socket(fp);
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
if (u) {
|
||||
if (!u->inflight) {
|
||||
BUG_ON(!list_empty(&u->link));
|
||||
list_add_tail(&u->link, &gc_inflight_list);
|
||||
} else {
|
||||
BUG_ON(list_empty(&u->link));
|
||||
}
|
||||
u->inflight++;
|
||||
/* Paired with READ_ONCE() in wait_for_unix_gc() */
|
||||
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
|
||||
}
|
||||
WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
|
||||
spin_unlock(&unix_gc_lock);
|
||||
}
|
||||
|
||||
void unix_notinflight(struct user_struct *user, struct file *fp)
|
||||
{
|
||||
struct unix_sock *u = unix_get_socket(fp);
|
||||
|
||||
spin_lock(&unix_gc_lock);
|
||||
|
||||
if (u) {
|
||||
BUG_ON(!u->inflight);
|
||||
BUG_ON(list_empty(&u->link));
|
||||
|
||||
u->inflight--;
|
||||
if (!u->inflight)
|
||||
list_del_init(&u->link);
|
||||
/* Paired with READ_ONCE() in wait_for_unix_gc() */
|
||||
WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
|
||||
}
|
||||
WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
|
||||
spin_unlock(&unix_gc_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* The "user->unix_inflight" variable is protected by the garbage
|
||||
* collection lock, and we just read it locklessly here. If you go
|
||||
* over the limit, there might be a tiny race in actually noticing
|
||||
* it across threads. Tough.
|
||||
*/
|
||||
static inline bool too_many_unix_fds(struct task_struct *p)
|
||||
{
|
||||
struct user_struct *user = current_user();
|
||||
|
||||
if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
|
||||
return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
|
||||
return false;
|
||||
}
|
||||
|
||||
int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (too_many_unix_fds(current))
|
||||
return -ETOOMANYREFS;
|
||||
|
||||
/*
|
||||
* Need to duplicate file references for the sake of garbage
|
||||
* collection. Otherwise a socket in the fps might become a
|
||||
* candidate for GC while the skb is not yet queued.
|
||||
*/
|
||||
UNIXCB(skb).fp = scm_fp_dup(scm->fp);
|
||||
if (!UNIXCB(skb).fp)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = scm->fp->count - 1; i >= 0; i--)
|
||||
unix_inflight(scm->fp->user, scm->fp->fp[i]);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(unix_attach_fds);
|
||||
|
||||
void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
|
||||
{
|
||||
int i;
|
||||
|
||||
scm->fp = UNIXCB(skb).fp;
|
||||
UNIXCB(skb).fp = NULL;
|
||||
|
||||
for (i = scm->fp->count-1; i >= 0; i--)
|
||||
unix_notinflight(scm->fp->user, scm->fp->fp[i]);
|
||||
}
|
||||
EXPORT_SYMBOL(unix_detach_fds);
|
||||
|
||||
void unix_destruct_scm(struct sk_buff *skb)
|
||||
{
|
||||
struct scm_cookie scm;
|
||||
|
||||
memset(&scm, 0, sizeof(scm));
|
||||
scm.pid = UNIXCB(skb).pid;
|
||||
if (UNIXCB(skb).fp)
|
||||
unix_detach_fds(&scm, skb);
|
||||
|
||||
/* Alas, it calls VFS */
|
||||
/* So fscking what? fput() had been SMP-safe since the last Summer */
|
||||
scm_destroy(&scm);
|
||||
sock_wfree(skb);
|
||||
}
|
||||
EXPORT_SYMBOL(unix_destruct_scm);
|
||||
|
||||
void io_uring_destruct_scm(struct sk_buff *skb)
|
||||
{
|
||||
unix_destruct_scm(skb);
|
||||
}
|
||||
EXPORT_SYMBOL(io_uring_destruct_scm);
|
@@ -1,10 +0,0 @@
#ifndef NET_UNIX_SCM_H
#define NET_UNIX_SCM_H

extern struct list_head gc_inflight_list;
extern spinlock_t unix_gc_lock;

int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb);
void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb);

#endif
@@ -105,7 +105,8 @@ static int __init sample_trace_array_init(void)
* NOTE: This function increments the reference counter
* associated with the trace array - "tr".
*/
tr = trace_array_get_by_name("sample-instance");
tr = trace_array_get_by_name_ext("sample-instance",
"sched,timer,kprobes");

if (!tr)
return -1;
@@ -88,6 +88,76 @@ static void write_debugfs(const char *fmt, ...)
}
}

static char *allocate_zero_filled_hugepage(size_t len)
{
char *result;
size_t i;

result = memalign(pmd_pagesize, len);
if (!result) {
printf("Fail to allocate memory\n");
exit(EXIT_FAILURE);
}

madvise(result, len, MADV_HUGEPAGE);

for (i = 0; i < len; i++)
result[i] = (char)0;

return result;
}

static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len)
{
unsigned long rss_anon_before, rss_anon_after;
size_t i;

if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
printf("No THP is allocated\n");
exit(EXIT_FAILURE);
}

rss_anon_before = rss_anon();
if (!rss_anon_before) {
printf("No RssAnon is allocated before split\n");
exit(EXIT_FAILURE);
}

/* split all THPs */
write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
(uint64_t)one_page + len, 0);

for (i = 0; i < len; i++)
if (one_page[i] != (char)0) {
printf("%ld byte corrupted\n", i);
exit(EXIT_FAILURE);
}

if (!check_huge_anon(one_page, 0, pmd_pagesize)) {
printf("Still AnonHugePages not split\n");
exit(EXIT_FAILURE);
}

rss_anon_after = rss_anon();
if (rss_anon_after >= rss_anon_before) {
printf("Incorrect RssAnon value. Before: %ld After: %ld\n",
rss_anon_before, rss_anon_after);
exit(EXIT_FAILURE);
}
}

void split_pmd_zero_pages(void)
{
char *one_page;
int nr_hpages = 4;
size_t len = nr_hpages * pmd_pagesize;

one_page = allocate_zero_filled_hugepage(len);
verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len);
printf("Split zero filled huge pages successful\n");
free(one_page);
}

void split_pmd_thp(void)
{
char *one_page;
@@ -305,6 +375,7 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}

split_pmd_zero_pages();
split_pmd_thp();
split_pte_mapped_thp();
split_file_backed_thp();
@@ -11,6 +11,7 @@

#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
#define SMAP_FILE_PATH "/proc/self/smaps"
#define STATUS_FILE_PATH "/proc/self/status"
#define MAX_LINE_LENGTH 500

unsigned int __page_size;
@@ -97,6 +98,27 @@ uint64_t read_pmd_pagesize(void)
return strtoul(buf, NULL, 10);
}

unsigned long rss_anon(void)
{
unsigned long rss_anon = 0;
FILE *fp;
char buffer[MAX_LINE_LENGTH];

fp = fopen(STATUS_FILE_PATH, "r");
if (!fp)
ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH);

if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer)))
goto err_out;

if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1)
ksft_exit_fail_msg("Reading status error\n");

err_out:
fclose(fp);
return rss_anon;
}

bool __check_huge(void *addr, char *pattern, int nr_hpages,
uint64_t hpage_size)
{
@@ -39,6 +39,7 @@ unsigned long pagemap_get_pfn(int fd, char *start);
void clear_softdirty(void);
bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
uint64_t read_pmd_pagesize(void);
unsigned long rss_anon(void);
bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);