diff --git a/BUILD.bazel b/BUILD.bazel index dbb598ac61ae..4be135108507 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -126,6 +126,7 @@ filegroup( "android/abi_gki_aarch64_mtk", "android/abi_gki_aarch64_mtktv", "android/abi_gki_aarch64_nothing", + "android/abi_gki_aarch64_nvidia", "android/abi_gki_aarch64_oplus", "android/abi_gki_aarch64_paragon", "android/abi_gki_aarch64_pixel", @@ -140,7 +141,7 @@ filegroup( "android/abi_gki_aarch64_virtual_device", "android/abi_gki_aarch64_vivo", "android/abi_gki_aarch64_xiaomi", - "android/abi_gki_aarch64_xiaomi2", + "android/abi_gki_aarch64_xiaomi_xring", ], visibility = ["//visibility:public"], ) @@ -1028,6 +1029,9 @@ ddk_headers( "drivers/pci/controller/dwc/pcie-designware.h", "drivers/thermal/thermal_core.h", "drivers/thermal/thermal_netlink.h", + "drivers/ufs/core/ufshcd-crypto.h", + "drivers/ufs/core/ufshcd-priv.h", + "drivers/ufs/host/ufshcd-pltfrm.h", "drivers/usb/dwc3/core.h", "sound/usb/card.h", "sound/usb/usbaudio.h", @@ -1045,6 +1049,7 @@ ddk_headers( "drivers/extcon", "drivers/pci/controller/dwc", "drivers/thermal", + "drivers/ufs", "drivers/usb", "sound/usb", "include", diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 34e56da8f55d..7cebb21b7b16 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -202,6 +202,16 @@ PMD-mappable transparent hugepage:: cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size +All THPs at fault and collapse time will be added to _deferred_list, +and will therefore be split under memory pressure if they are considered +"underused". A THP is underused if the number of zero-filled pages in +the THP is above max_ptes_none (see below). It is possible to disable +this behaviour by writing 0 to shrink_underused, and enable it by writing +1 to it:: + + echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused + echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused + khugepaged will be automatically started when one or more hugepage sizes are enabled (either by directly setting "always" or "madvise", or by setting "inherit" while the top-level enabled is set to "always" @@ -443,6 +453,12 @@ thp_deferred_split_page splitting it would free up some memory. Pages on split queue are going to be split under memory pressure. +thp_underused_split_page + is incremented when a huge page on the split queue was split + because it was underused. A THP is underused if the number of + zero pages in the THP is above a certain threshold + (/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none). + thp_split_pmd is incremented every time a PMD split into table of PTEs. This can happen, for instance, when application calls mprotect() or @@ -510,6 +526,18 @@ split_deferred it would free up some memory. Pages on split queue are going to be split under memory pressure, if splitting is possible. +nr_anon + the number of anonymous THP we have in the whole system. These THPs + might be currently entirely mapped or have partially unmapped/unused + subpages. + +nr_anon_partially_mapped + the number of anonymous THP which are likely partially mapped, possibly + wasting memory, and have been queued for deferred memory reclamation. + Note that in some corner cases (e.g., failed migration), we might detect + an anonymous THP as "partially mapped" and count it here, even though it + is not actually partially mapped anymore. 
+ As the system ages, allocating huge pages may be expensive as the system uses memory compaction to copy data around memory to free a huge page for use. There are some counters in ``/proc/vmstat`` to help diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 52b09adda2aa..0de7176f5e86 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -2478,6 +2478,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x082ccdd1 } +pointer_reference { + id: 0x08a51ea1 + kind: POINTER + pointee_type_id: 0x08d49c19 +} pointer_reference { id: 0x08a8dfa4 kind: POINTER @@ -3073,6 +3078,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x01d3d77e } +pointer_reference { + id: 0x0aeaa065 + kind: POINTER + pointee_type_id: 0x01ea670b +} pointer_reference { id: 0x0aee7ba0 kind: POINTER @@ -11518,6 +11528,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x5d82e969 } +pointer_reference { + id: 0x1df3293f + kind: POINTER + pointee_type_id: 0x5d8c4261 +} pointer_reference { id: 0x1df3d475 kind: POINTER @@ -12243,6 +12258,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0xa8ae6b7a } +pointer_reference { + id: 0x20c77d4a + kind: POINTER + pointee_type_id: 0xa95d13b4 +} pointer_reference { id: 0x20d23755 kind: POINTER @@ -12453,6 +12473,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0xaf33de29 } +pointer_reference { + id: 0x2163ef30 + kind: POINTER + pointee_type_id: 0xafcf5a5c +} pointer_reference { id: 0x2170d06d kind: POINTER @@ -24228,6 +24253,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0xe9917f17 } +pointer_reference { + id: 0x30f6da87 + kind: POINTER + pointee_type_id: 0xe99b8c83 +} pointer_reference { id: 0x30fbef83 kind: POINTER @@ -29048,6 +29078,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0xc0009f60 } +pointer_reference { + id: 0x3a91c7e7 + kind: POINTER + pointee_type_id: 0xc007f900 +} pointer_reference { id: 0x3a91ca5f kind: POINTER @@ -34038,6 +34073,11 @@ typedef { name: "te1_settings" referred_type_id: 0x247e8a06 } +typedef { + id: 0x8d4bfd24 + name: "tegra_bpmp_mrq_handler_t" + referred_type_id: 0x0aeaa065 +} typedef { id: 0x865acc96 name: "time64_t" @@ -36933,6 +36973,11 @@ qualified { qualifier: CONST qualified_type_id: 0xd659846e } +qualified { + id: 0xe99b8c83 + qualifier: CONST + qualified_type_id: 0xd6298487 +} qualified { id: 0xe9af5892 qualifier: CONST @@ -38511,6 +38556,11 @@ array { number_of_elements: 3 element_type_id: 0x02cb01c6 } +array { + id: 0x1359d776 + number_of_elements: 108 + element_type_id: 0x384f7d7c +} array { id: 0x13735712 number_of_elements: 20 @@ -41932,6 +41982,10 @@ array { id: 0xfca4258b element_type_id: 0xe276adef } +array { + id: 0xfcb1c304 + element_type_id: 0xe22137d3 +} array { id: 0xfd17183f element_type_id: 0xe4ba5b3d @@ -42469,6 +42523,10 @@ member { id: 0x2c96db22 type_id: 0x3d751c99 } +member { + id: 0x2cd6acbf + type_id: 0x3c74c2ee +} member { id: 0x2d16b3a0 type_id: 0x3b74be91 @@ -42939,8 +42997,8 @@ member { offset: 704 } member { - id: 0x335fcb96 - type_id: 0x42514dcd + id: 0x335fcf90 + type_id: 0x42515dd6 offset: 64 } member { @@ -43414,6 +43472,11 @@ member { type_id: 0x57bf00b8 offset: 576 } +member { + id: 0x3643ba2c + type_id: 0x56209a0e + offset: 256 +} member { id: 0x36477112 type_id: 0x5633b45b @@ -45547,6 +45610,11 @@ member { offset: 242 bitsize: 14 } +member { + id: 0x906218d7 + name: "__unused" + type_id: 0xa179a8c5 +} member { id: 0x9086e58b name: "__unused" @@ -48044,6 +48112,12 @@ member { name: "addr" type_id: 0x33756485 } +member { + 
id: 0x24257f11 + name: "addr" + type_id: 0x08a51ea1 + offset: 6848 +} member { id: 0x2425a5c3 name: "addr" @@ -57911,6 +57985,12 @@ member { type_id: 0x120540d1 offset: 4480 } +member { + id: 0xd38b8035 + name: "bindlock" + type_id: 0xa7c362b0 + offset: 7424 +} member { id: 0x975a5c19 name: "bindv6only" @@ -71030,6 +71110,12 @@ member { name: "count_objects" type_id: 0x0484940b } +member { + id: 0x11d6bda9 + name: "count_unix" + type_id: 0xb0312d5a + offset: 16576 +} member { id: 0x20014498 name: "counter" @@ -75156,6 +75242,11 @@ member { type_id: 0x391f15ea offset: 384 } +member { + id: 0xffab3b79 + name: "data" + type_id: 0x391f15ea +} member { id: 0xffab3bd2 name: "data" @@ -76287,6 +76378,12 @@ member { type_id: 0x6d7f5ff6 offset: 3200 } +member { + id: 0x3bde3edb + name: "dead" + type_id: 0x6d7f5ff6 + offset: 16328 +} member { id: 0x6491a5c4 name: "dead" @@ -86100,12 +86197,23 @@ member { type_id: 0xf313e71a offset: 33280 } +member { + id: 0x76197573 + name: "edges" + type_id: 0xd3c80119 +} member { id: 0x7692e117 name: "edges" type_id: 0x585b40a5 offset: 1728 } +member { + id: 0x76d74733 + name: "edges" + type_id: 0x1df3293f + offset: 16512 +} member { id: 0x31c31f3f name: "edid" @@ -112244,6 +112352,12 @@ member { type_id: 0x33756485 offset: 64 } +member { + id: 0xad0a7aac + name: "index" + type_id: 0x33756485 + offset: 448 +} member { id: 0xad0a7d70 name: "index" @@ -112651,6 +112765,12 @@ member { type_id: 0xd3c80119 offset: 2752 } +member { + id: 0x799ec517 + name: "inflight" + type_id: 0x6d7f5ff6 + offset: 16320 +} member { id: 0x79f74293 name: "inflight" @@ -115687,6 +115807,12 @@ member { type_id: 0xd3c80119 offset: 20288 } +member { + id: 0x3eb3deac + name: "iolock" + type_id: 0xa7c362b0 + offset: 7040 +} member { id: 0x5131ffa2 name: "iomap" @@ -124200,6 +124326,12 @@ member { type_id: 0x2c912323 offset: 768 } +member { + id: 0xe2a2a0b6 + name: "listener" + type_id: 0x1d44326e + offset: 7936 +} member { id: 0xe2a2a2d7 name: "listener" @@ -124866,6 +124998,12 @@ member { name: "lock" type_id: 0xf4933b90 } +member { + id: 0x2d1fe06a + name: "lock" + type_id: 0xf313e71a + offset: 8000 +} member { id: 0x2d1fe165 name: "lock" @@ -136887,6 +137025,11 @@ member { type_id: 0xebbaa9d5 offset: 800 } +member { + id: 0x630c78f4 + name: "mrq" + type_id: 0x4585663f +} member { id: 0x6360d0b2 name: "mrq" @@ -138302,6 +138445,12 @@ member { type_id: 0xfc59be2e offset: 128 } +member { + id: 0x0d27d97f + name: "name" + type_id: 0xfcb1c304 + offset: 64 +} member { id: 0x0d369e2a name: "name" @@ -143865,6 +144014,11 @@ member { type_id: 0x92233392 offset: 1024 } +member { + id: 0xc7d661bf + name: "nr_fds" + type_id: 0x74d29cf1 +} member { id: 0xc661bf13 name: "nr_file_filters" @@ -144442,6 +144596,12 @@ member { type_id: 0x4585663f offset: 21760 } +member { + id: 0xb206d8fd + name: "nr_unix_fds" + type_id: 0x33756485 + offset: 64 +} member { id: 0xfb6f3a99 name: "nr_user" @@ -148451,6 +148611,12 @@ member { type_id: 0x0dad77a5 offset: 896 } +member { + id: 0x337b49fc + name: "oob_skb" + type_id: 0x054f691a + offset: 9152 +} member { id: 0x97510416 name: "oom_flag_origin" @@ -150480,6 +150646,12 @@ member { type_id: 0x34d79405 offset: 128 } +member { + id: 0x0fc2a957 + name: "out_degree" + type_id: 0x33756485 + offset: 384 +} member { id: 0xbdac191c name: "out_ep" @@ -153382,6 +153554,12 @@ member { type_id: 0x5adf57ae offset: 256 } +member { + id: 0x77353e9c + name: "path" + type_id: 0x71a68091 + offset: 6912 +} member { id: 0x776fe9d5 name: "path" @@ -154380,6 +154558,12 @@ member { name: 
"peer" type_id: 0x0258f96e } +member { + id: 0xb794a532 + name: "peer" + type_id: 0x1d44326e + offset: 7808 +} member { id: 0x3be8531a name: "peer2peer" @@ -154414,6 +154598,18 @@ member { type_id: 0x7584e7da offset: 304 } +member { + id: 0x6a6789c8 + name: "peer_wake" + type_id: 0x347ff86e + offset: 8704 +} +member { + id: 0x337f3641 + name: "peer_wq" + type_id: 0x5eee2044 + offset: 8192 +} member { id: 0xc29a8766 name: "peers" @@ -156569,10 +156765,9 @@ member { offset: 576 } member { - id: 0x03347550 + id: 0x0345ffe6 name: "pinned_pages" - type_id: 0xa179a8c5 - offset: 256 + type_id: 0xd0f3b5bf } member { id: 0x88a7076f @@ -160101,6 +160296,11 @@ member { type_id: 0x6423b0d9 offset: 608 } +member { + id: 0x5ba539f6 + name: "predecessor" + type_id: 0x3a91c7e7 +} member { id: 0x649fc3ef name: "predicted" @@ -174689,6 +174889,12 @@ member { type_id: 0x6720d32f offset: 192 } +member { + id: 0x51f16b6f + name: "ret" + type_id: 0x6720d32f + offset: 128 +} member { id: 0x51f63d7f name: "ret" @@ -177379,6 +177585,12 @@ member { type_id: 0x3b007e79 offset: 64 } +member { + id: 0x6cd7f7a7 + name: "rx" + type_id: 0x27b8946e + offset: 192 +} member { id: 0xec2f95e2 name: "rx_addr" @@ -180175,6 +180387,18 @@ member { type_id: 0x6d7f5ff6 offset: 40 } +member { + id: 0xd741c1b9 + name: "scc_entry" + type_id: 0xd3c80119 + offset: 256 +} +member { + id: 0x027dfeed + name: "scc_index" + type_id: 0x33756485 + offset: 512 +} member { id: 0x0ac848fc name: "scdc" @@ -180437,6 +180661,12 @@ member { name: "sclass" type_id: 0xc9082b19 } +member { + id: 0xac812894 + name: "scm_stat" + type_id: 0xfa80d3fa + offset: 9024 +} member { id: 0xe94f5bce name: "scmd_flags" @@ -189307,6 +189537,11 @@ member { type_id: 0x0f07cf2c offset: 1408 } +member { + id: 0x406e59a2 + name: "split" + type_id: 0x3721bb90 +} member { id: 0x406ecc6d name: "split" @@ -190613,6 +190848,12 @@ member { type_id: 0xc9082b19 offset: 224 } +member { + id: 0xc7b7491e + name: "stack_entry" + type_id: 0xd3c80119 + offset: 256 +} member { id: 0x5775af49 name: "stack_masks" @@ -194236,6 +194477,12 @@ member { type_id: 0xe62ebf07 offset: 64 } +member { + id: 0x132c9198 + name: "successor" + type_id: 0x3a91c7e7 + offset: 64 +} member { id: 0x90938126 name: "suggested_x_property" @@ -194314,6 +194561,17 @@ member { type_id: 0x2e0f9112 offset: 512 } +member { + id: 0xa2e62cd6 + name: "sun_family" + type_id: 0xe0705941 +} +member { + id: 0x7339613c + name: "sun_path" + type_id: 0x1359d776 + offset: 16 +} member { id: 0xc6d8a98e name: "super_block_align" @@ -204139,6 +204397,12 @@ member { type_id: 0x2c3ef046 offset: 128 } +member { + id: 0x65ff358a + name: "tx" + type_id: 0x27b0e88e + offset: 64 +} member { id: 0x562b2b94 name: "tx16" @@ -212437,6 +212701,18 @@ member { type_id: 0xc9082b19 offset: 8960 } +member { + id: 0x5eddc09d + name: "vertex" + type_id: 0x20c77d4a + offset: 7872 +} +member { + id: 0x957e8a2d + name: "vertex_entry" + type_id: 0xd3c80119 + offset: 128 +} member { id: 0x7f926278 name: "vertical_position" @@ -212455,6 +212731,12 @@ member { type_id: 0xe8034002 offset: 16 } +member { + id: 0x7b11d1cc + name: "vertices" + type_id: 0xd3c80119 + offset: 16384 +} member { id: 0x444fc004 name: "vet_description" @@ -222521,6 +222803,25 @@ struct_union { member_id: 0xb84054d1 } } +struct_union { + id: 0x27b0e88e + kind: STRUCT + definition { + bytesize: 16 + member_id: 0xffab3b79 + member_id: 0xd98a2dfb + } +} +struct_union { + id: 0x27b8946e + kind: STRUCT + definition { + bytesize: 24 + member_id: 0xff8a9909 + member_id: 0xd98a2dfb + 
member_id: 0x51f16b6f + } +} struct_union { id: 0x27bfa089 kind: STRUCT @@ -223874,6 +224175,14 @@ struct_union { member_id: 0xc101e64f } } +struct_union { + id: 0x3c74c2ee + kind: STRUCT + definition { + bytesize: 16 + member_id: 0x906218d7 + } +} struct_union { id: 0x3c9f0fa2 kind: STRUCT @@ -224285,12 +224594,13 @@ struct_union { } } struct_union { - id: 0x42514dcd + id: 0x42515dd6 kind: UNION definition { bytesize: 16 member_id: 0x52238160 member_id: 0x8dc1500c + member_id: 0x406e59a2 } } struct_union { @@ -225487,6 +225797,16 @@ struct_union { member_id: 0xdf160d99 } } +struct_union { + id: 0x56209a0e + kind: UNION + definition { + bytesize: 16 + member_id: 0x0345ffe6 + member_id: 0x2cd6acbf + member_id: 0x36752b74 + } +} struct_union { id: 0x5633b45b kind: UNION @@ -253178,7 +253498,7 @@ struct_union { definition { bytesize: 24 member_id: 0x5c7f890c - member_id: 0x335fcb96 + member_id: 0x335fcf90 } } struct_union { @@ -253468,7 +253788,7 @@ struct_union { bytesize: 64 member_id: 0xb8f5134f member_id: 0x63c436ff - member_id: 0x03347550 + member_id: 0x3643ba2c member_id: 0x0f7f629e member_id: 0x3a2d39cb } @@ -266699,11 +267019,26 @@ struct_union { kind: STRUCT name: "scm_fp_list" definition { - bytesize: 2040 + bytesize: 2080 member_id: 0x65956ee9 member_id: 0x97463852 member_id: 0x042a2402 member_id: 0x5449f846 + member_id: 0x799ec517 + member_id: 0x3bde3edb + member_id: 0x7b11d1cc + member_id: 0x76d74733 + member_id: 0x11d6bda9 + } +} +struct_union { + id: 0xfa80d3fa + kind: STRUCT + name: "scm_stat" + definition { + bytesize: 16 + member_id: 0xc7d661bf + member_id: 0xb206d8fd } } struct_union { @@ -271715,6 +272050,16 @@ struct_union { member_id: 0x98cf3099 } } +struct_union { + id: 0xe22137d3 + kind: STRUCT + name: "sockaddr_un" + definition { + bytesize: 110 + member_id: 0xa2e62cd6 + member_id: 0x7339613c + } +} struct_union { id: 0x14637955 kind: STRUCT @@ -273939,6 +274284,18 @@ struct_union { kind: STRUCT name: "tegra_bpmp_clk" } +struct_union { + id: 0xafcf5a5c + kind: STRUCT + name: "tegra_bpmp_message" + definition { + bytesize: 56 + member_id: 0x630c78f4 + member_id: 0x65ff358a + member_id: 0x6cd7f7a7 + member_id: 0x2d5bf0a8 + } +} struct_union { id: 0xec7454a4 kind: STRUCT @@ -277108,6 +277465,50 @@ struct_union { member_id: 0xb008455d } } +struct_union { + id: 0x08d49c19 + kind: STRUCT + name: "unix_address" + definition { + bytesize: 8 + member_id: 0xb7dcf8ac + member_id: 0xb862f8be + member_id: 0x0d27d97f + } +} +struct_union { + id: 0x5d8c4261 + kind: STRUCT + name: "unix_edge" + definition { + bytesize: 48 + member_id: 0x5ba539f6 + member_id: 0x132c9198 + member_id: 0x957e8a2d + member_id: 0xc7b7491e + } +} +struct_union { + id: 0xc007f900 + kind: STRUCT + name: "unix_sock" + definition { + bytesize: 1152 + member_id: 0x82ce9da8 + member_id: 0x24257f11 + member_id: 0x77353e9c + member_id: 0x3eb3deac + member_id: 0xd38b8035 + member_id: 0xb794a532 + member_id: 0x5eddc09d + member_id: 0xe2a2a0b6 + member_id: 0x2d1fe06a + member_id: 0x337f3641 + member_id: 0x6a6789c8 + member_id: 0xac812894 + member_id: 0x337b49fc + } +} struct_union { id: 0x3f8f92cb kind: STRUCT @@ -277118,6 +277519,20 @@ struct_union { member_id: 0xff1cd5bf } } +struct_union { + id: 0xa95d13b4 + kind: STRUCT + name: "unix_vertex" + definition { + bytesize: 72 + member_id: 0x76197573 + member_id: 0x4d8789fe + member_id: 0xd741c1b9 + member_id: 0x0fc2a957 + member_id: 0xad0a7aac + member_id: 0x027dfeed + } +} struct_union { id: 0xad1e2d0f kind: STRUCT @@ -304536,6 +304951,13 @@ function { parameter_id: 
0x4585663f parameter_id: 0x07944f4b } +function { + id: 0x01ea670b + return_type_id: 0x48b5725f + parameter_id: 0x4585663f + parameter_id: 0x05501385 + parameter_id: 0x18bd6530 +} function { id: 0x01ecbbc0 return_type_id: 0x48b5725f @@ -305659,6 +306081,14 @@ function { parameter_id: 0x4585663f parameter_id: 0x0ee0d5d1 } +function { + id: 0x105d412e + return_type_id: 0x48b5725f + parameter_id: 0x05501385 + parameter_id: 0x6720d32f + parameter_id: 0x391f15ea + parameter_id: 0xf435685e +} function { id: 0x105fb374 return_type_id: 0xd5cc9c9a @@ -312446,6 +312876,13 @@ function { parameter_id: 0x25520d15 parameter_id: 0x026c3dea } +function { + id: 0x19c5ab78 + return_type_id: 0x48b5725f + parameter_id: 0x213700a8 + parameter_id: 0x4585663f + parameter_id: 0x18bd6530 +} function { id: 0x19c6594b return_type_id: 0x48b5725f @@ -316644,6 +317081,11 @@ function { parameter_id: 0x6720d32f parameter_id: 0x6d7f5ff6 } +function { + id: 0x1f202302 + return_type_id: 0x48b5725f + parameter_id: 0x3ee1ca44 +} function { id: 0x1f21f887 return_type_id: 0xd5cc9c9a @@ -317290,6 +317732,12 @@ function { parameter_id: 0x3ba261b0 parameter_id: 0x6d7f5ff6 } +function { + id: 0x1fc56d5c + return_type_id: 0x07f2af49 + parameter_id: 0x3e10b518 + parameter_id: 0x3e10b518 +} function { id: 0x1fc886d7 return_type_id: 0x48b5725f @@ -324821,6 +325269,13 @@ function { return_type_id: 0x6720d32f parameter_id: 0x37cc6bab } +function { + id: 0x907748a2 + return_type_id: 0x6720d32f + parameter_id: 0x347303b4 + parameter_id: 0x3aff5796 + parameter_id: 0x0258f96e +} function { id: 0x9077fd27 return_type_id: 0x6720d32f @@ -327883,6 +328338,11 @@ function { parameter_id: 0x3806390a parameter_id: 0x6d7f5ff6 } +function { + id: 0x923891be + return_type_id: 0x6720d32f + parameter_id: 0x3ee1ca44 +} function { id: 0x923af25c return_type_id: 0x6720d32f @@ -327903,6 +328363,20 @@ function { return_type_id: 0x6720d32f parameter_id: 0x3ef6d4b8 } +function { + id: 0x923e40fa + return_type_id: 0x6720d32f + parameter_id: 0x3ee1ca44 + parameter_id: 0x0258f96e + parameter_id: 0x30f6da87 + parameter_id: 0xe02e14d6 + parameter_id: 0x30f6da87 + parameter_id: 0xe02e14d6 + parameter_id: 0x4585663f + parameter_id: 0xf435685e + parameter_id: 0x0d408c02 + parameter_id: 0x18bd6530 +} function { id: 0x923e6d3f return_type_id: 0x6720d32f @@ -328734,6 +329208,12 @@ function { parameter_id: 0x31c8b544 parameter_id: 0xcd84e3cd } +function { + id: 0x92c4f8dc + return_type_id: 0x6720d32f + parameter_id: 0x3ee1ca44 + parameter_id: 0x3f1a5886 +} function { id: 0x92c581e2 return_type_id: 0x6720d32f @@ -331159,6 +331639,14 @@ function { return_type_id: 0x6720d32f parameter_id: 0x25db57c0 } +function { + id: 0x94f80697 + return_type_id: 0x6720d32f + parameter_id: 0x213700a8 + parameter_id: 0x4585663f + parameter_id: 0x8d4bfd24 + parameter_id: 0x18bd6530 +} function { id: 0x94f8a6cc return_type_id: 0x6720d32f @@ -331387,6 +331875,12 @@ function { parameter_id: 0x27459bea parameter_id: 0x4585663f } +function { + id: 0x9548acb9 + return_type_id: 0x6720d32f + parameter_id: 0x213700a8 + parameter_id: 0x2163ef30 +} function { id: 0x954a5c30 return_type_id: 0x6720d32f @@ -334120,6 +334614,11 @@ function { parameter_id: 0x27a7c613 parameter_id: 0x2cacacc0 } +function { + id: 0x98764c5c + return_type_id: 0x6720d32f + parameter_id: 0x17dabdcd +} function { id: 0x98773f07 return_type_id: 0x6720d32f @@ -354345,6 +354844,12 @@ function { parameter_id: 0x4faa9b63 parameter_id: 0x1856a912 } +function { + id: 0xf7ff8eb6 + return_type_id: 0x6d7f5ff6 + parameter_id: 0x213700a8 + 
parameter_id: 0x4585663f +} function { id: 0xf806feb1 return_type_id: 0x6d7f5ff6 @@ -362813,6 +363318,15 @@ elf_symbol { type_id: 0x9bb62df9 full_name: "__traceiter_android_vh_meminfo_proc_show" } +elf_symbol { + id: 0xe02fe9e3 + name: "__traceiter_android_vh_mempool_alloc_skip_wait" + is_defined: true + symbol_type: FUNCTION + crc: 0xabe64979 + type_id: 0x9b702fa1 + full_name: "__traceiter_android_vh_mempool_alloc_skip_wait" +} elf_symbol { id: 0x105350cb name: "__traceiter_android_vh_mglru_should_abort_scan" @@ -362876,6 +363390,15 @@ elf_symbol { type_id: 0x9b7ba7c5 full_name: "__traceiter_android_vh_mm_direct_reclaim_exit" } +elf_symbol { + id: 0x3fe16974 + name: "__traceiter_android_vh_mm_free_page" + is_defined: true + symbol_type: FUNCTION + crc: 0xe8bcf150 + type_id: 0x9bb5b719 + full_name: "__traceiter_android_vh_mm_free_page" +} elf_symbol { id: 0x6f5c8275 name: "__traceiter_android_vh_mm_kcompactd_cpu_online" @@ -365099,6 +365622,15 @@ elf_symbol { type_id: 0x9bd88b02 full_name: "__traceiter_android_vh_wq_wake_idle_worker" } +elf_symbol { + id: 0xd2ca02c1 + name: "__traceiter_android_vh_xhci_full_reset_on_remove" + is_defined: true + symbol_type: FUNCTION + crc: 0x21b6ef65 + type_id: 0x9be885da + full_name: "__traceiter_android_vh_xhci_full_reset_on_remove" +} elf_symbol { id: 0x3a6c45d6 name: "__traceiter_android_vh_xhci_resume" @@ -369878,6 +370410,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_meminfo_proc_show" } +elf_symbol { + id: 0x5e49798d + name: "__tracepoint_android_vh_mempool_alloc_skip_wait" + is_defined: true + symbol_type: OBJECT + crc: 0x2317dfbe + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_mempool_alloc_skip_wait" +} elf_symbol { id: 0xca4f3601 name: "__tracepoint_android_vh_mglru_should_abort_scan" @@ -369941,6 +370482,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_mm_direct_reclaim_exit" } +elf_symbol { + id: 0x533ca98e + name: "__tracepoint_android_vh_mm_free_page" + is_defined: true + symbol_type: OBJECT + crc: 0x9efc24a8 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_mm_free_page" +} elf_symbol { id: 0x0f593caf name: "__tracepoint_android_vh_mm_kcompactd_cpu_online" @@ -372164,6 +372714,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_wq_wake_idle_worker" } +elf_symbol { + id: 0x38d95973 + name: "__tracepoint_android_vh_xhci_full_reset_on_remove" + is_defined: true + symbol_type: OBJECT + crc: 0x8a370dcb + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_xhci_full_reset_on_remove" +} elf_symbol { id: 0x6cb1a35c name: "__tracepoint_android_vh_xhci_resume" @@ -375655,6 +376214,15 @@ elf_symbol { type_id: 0xfcb31d8f full_name: "blk_mq_sched_try_merge" } +elf_symbol { + id: 0xd28f8ce5 + name: "blk_mq_start_hw_queues" + is_defined: true + symbol_type: FUNCTION + crc: 0xe5b16473 + type_id: 0x12c8ce83 + full_name: "blk_mq_start_hw_queues" +} elf_symbol { id: 0x96f2cda8 name: "blk_mq_start_request" @@ -375682,6 +376250,15 @@ elf_symbol { type_id: 0x181a6fb5 full_name: "blk_mq_stop_hw_queue" } +elf_symbol { + id: 0xd13a60aa + name: "blk_mq_stop_hw_queues" + is_defined: true + symbol_type: FUNCTION + crc: 0x27dd6377 + type_id: 0x12c8ce83 + full_name: "blk_mq_stop_hw_queues" +} elf_symbol { id: 0x37d3a115 name: "blk_mq_tagset_busy_iter" @@ -407836,6 +408413,15 @@ elf_symbol { type_id: 0x119fef8e full_name: "of_phy_simple_xlate" } +elf_symbol { + id: 0x9c2bfa6b + name: "of_platform_default_populate" + is_defined: true + symbol_type: FUNCTION + crc: 
0xfcdc9248 + type_id: 0x907748a2 + full_name: "of_platform_default_populate" +} elf_symbol { id: 0x840bde05 name: "of_platform_depopulate" @@ -417418,6 +418004,15 @@ elf_symbol { type_id: 0x0ff8d13d full_name: "scsi_host_alloc" } +elf_symbol { + id: 0xfaba855b + name: "scsi_host_busy" + is_defined: true + symbol_type: FUNCTION + crc: 0x9665b881 + type_id: 0x98764c5c + full_name: "scsi_host_busy" +} elf_symbol { id: 0x022517f0 name: "scsi_host_lookup" @@ -425356,6 +425951,123 @@ elf_symbol { type_id: 0x1f174592 full_name: "tcpm_vbus_change" } +elf_symbol { + id: 0xe54ea1f3 + name: "tegra_bpmp_free_mrq" + is_defined: true + symbol_type: FUNCTION + crc: 0xaf6bcfea + type_id: 0x19c5ab78 + full_name: "tegra_bpmp_free_mrq" +} +elf_symbol { + id: 0xa2565005 + name: "tegra_bpmp_mrq_is_supported" + is_defined: true + symbol_type: FUNCTION + crc: 0xc6b176d9 + type_id: 0xf7ff8eb6 + full_name: "tegra_bpmp_mrq_is_supported" +} +elf_symbol { + id: 0xfc2dbec4 + name: "tegra_bpmp_mrq_return" + is_defined: true + symbol_type: FUNCTION + crc: 0x223cf89d + type_id: 0x105d412e + full_name: "tegra_bpmp_mrq_return" +} +elf_symbol { + id: 0x08907db0 + name: "tegra_bpmp_request_mrq" + is_defined: true + symbol_type: FUNCTION + crc: 0x4bd2db5c + type_id: 0x94f80697 + full_name: "tegra_bpmp_request_mrq" +} +elf_symbol { + id: 0x81a07067 + name: "tegra_bpmp_transfer" + is_defined: true + symbol_type: FUNCTION + crc: 0xf29da5b1 + type_id: 0x9548acb9 + full_name: "tegra_bpmp_transfer" +} +elf_symbol { + id: 0xb47ee8bd + name: "tegra_bpmp_transfer_atomic" + is_defined: true + symbol_type: FUNCTION + crc: 0x6737b43c + type_id: 0x9548acb9 + full_name: "tegra_bpmp_transfer_atomic" +} +elf_symbol { + id: 0x07f159e7 + name: "tegra_ivc_init" + is_defined: true + symbol_type: FUNCTION + crc: 0x147021c7 + type_id: 0x923e40fa + full_name: "tegra_ivc_init" +} +elf_symbol { + id: 0x94af2b02 + name: "tegra_ivc_notified" + is_defined: true + symbol_type: FUNCTION + crc: 0xf425fd43 + type_id: 0x923891be + full_name: "tegra_ivc_notified" +} +elf_symbol { + id: 0xa7d44351 + name: "tegra_ivc_read_advance" + is_defined: true + symbol_type: FUNCTION + crc: 0xd971c967 + type_id: 0x923891be + full_name: "tegra_ivc_read_advance" +} +elf_symbol { + id: 0x8e08a107 + name: "tegra_ivc_read_get_next_frame" + is_defined: true + symbol_type: FUNCTION + crc: 0x5f8dbfc9 + type_id: 0x92c4f8dc + full_name: "tegra_ivc_read_get_next_frame" +} +elf_symbol { + id: 0xe19aecef + name: "tegra_ivc_reset" + is_defined: true + symbol_type: FUNCTION + crc: 0x063d20ed + type_id: 0x1f202302 + full_name: "tegra_ivc_reset" +} +elf_symbol { + id: 0xc746a415 + name: "tegra_ivc_write_advance" + is_defined: true + symbol_type: FUNCTION + crc: 0x35f1975b + type_id: 0x923891be + full_name: "tegra_ivc_write_advance" +} +elf_symbol { + id: 0x175273e3 + name: "tegra_ivc_write_get_next_frame" + is_defined: true + symbol_type: FUNCTION + crc: 0xd843aff5 + type_id: 0x92c4f8dc + full_name: "tegra_ivc_write_get_next_frame" +} elf_symbol { id: 0xa8f0fe44 name: "tegra_mc_probe_device" @@ -425817,6 +426529,15 @@ elf_symbol { type_id: 0x1f3d2f88 full_name: "trace_array_get_by_name" } +elf_symbol { + id: 0x21d9778a + name: "trace_array_get_by_name_ext" + is_defined: true + symbol_type: FUNCTION + crc: 0x11bbc6d1 + type_id: 0x1fc56d5c + full_name: "trace_array_get_by_name_ext" +} elf_symbol { id: 0x9fa2aa18 name: "trace_array_init_printk" @@ -436482,6 +437203,7 @@ interface { symbol_id: 0x3431d426 symbol_id: 0x8c174a7d symbol_id: 0xea8ce2b0 + symbol_id: 0xe02fe9e3 symbol_id: 0x105350cb 
symbol_id: 0x83742db6 symbol_id: 0xf8413699 @@ -436489,6 +437211,7 @@ interface { symbol_id: 0xf928bf8a symbol_id: 0x9f58159a symbol_id: 0x29c67d40 + symbol_id: 0x3fe16974 symbol_id: 0x6f5c8275 symbol_id: 0xf182fb15 symbol_id: 0xe44dacb1 @@ -436736,6 +437459,7 @@ interface { symbol_id: 0xae5e5469 symbol_id: 0xc71fde47 symbol_id: 0x62fba41c + symbol_id: 0xd2ca02c1 symbol_id: 0x3a6c45d6 symbol_id: 0x6a18c879 symbol_id: 0x18619e65 @@ -437267,6 +437991,7 @@ interface { symbol_id: 0xac62c748 symbol_id: 0xa2bd1edf symbol_id: 0xa5b4e5b2 + symbol_id: 0x5e49798d symbol_id: 0xca4f3601 symbol_id: 0x420ef2d0 symbol_id: 0xb32b3b17 @@ -437274,6 +437999,7 @@ interface { symbol_id: 0x72c79d80 symbol_id: 0xd333a65c symbol_id: 0xddcff44a + symbol_id: 0x533ca98e symbol_id: 0x0f593caf symbol_id: 0x47bcd15f symbol_id: 0xb6da564f @@ -437521,6 +438247,7 @@ interface { symbol_id: 0xa13f65ff symbol_id: 0xcd2463fd symbol_id: 0xf6c6715e + symbol_id: 0x38d95973 symbol_id: 0x6cb1a35c symbol_id: 0xd0cbbcf3 symbol_id: 0x85ad7f9f @@ -437909,9 +438636,11 @@ interface { symbol_id: 0x49ee3f1e symbol_id: 0x646a1dc6 symbol_id: 0xc40c1fbe + symbol_id: 0xd28f8ce5 symbol_id: 0x96f2cda8 symbol_id: 0x1162870c symbol_id: 0x310e729f + symbol_id: 0xd13a60aa symbol_id: 0x37d3a115 symbol_id: 0x9c9435eb symbol_id: 0x95bdba0c @@ -441476,6 +442205,7 @@ interface { symbol_id: 0x783e3f26 symbol_id: 0xeb2dab5b symbol_id: 0x5516ad40 + symbol_id: 0x9c2bfa6b symbol_id: 0x840bde05 symbol_id: 0x923f5818 symbol_id: 0x41ba71c8 @@ -442540,6 +443270,7 @@ interface { symbol_id: 0xb77321e1 symbol_id: 0x8ef5c221 symbol_id: 0x32b196e0 + symbol_id: 0xfaba855b symbol_id: 0x022517f0 symbol_id: 0x4d8a452e symbol_id: 0x7509c737 @@ -443422,6 +444153,19 @@ interface { symbol_id: 0xfccd15d1 symbol_id: 0x591431f1 symbol_id: 0x6869c83d + symbol_id: 0xe54ea1f3 + symbol_id: 0xa2565005 + symbol_id: 0xfc2dbec4 + symbol_id: 0x08907db0 + symbol_id: 0x81a07067 + symbol_id: 0xb47ee8bd + symbol_id: 0x07f159e7 + symbol_id: 0x94af2b02 + symbol_id: 0xa7d44351 + symbol_id: 0x8e08a107 + symbol_id: 0xe19aecef + symbol_id: 0xc746a415 + symbol_id: 0x175273e3 symbol_id: 0xa8f0fe44 symbol_id: 0xcb1a674c symbol_id: 0x7cfd9233 @@ -443473,6 +444217,7 @@ interface { symbol_id: 0x961122f4 symbol_id: 0xa7da1ac2 symbol_id: 0xe2481840 + symbol_id: 0x21d9778a symbol_id: 0x9fa2aa18 symbol_id: 0x49737cd5 symbol_id: 0x86b2ecdf diff --git a/android/abi_gki_aarch64.stg.allowed_breaks b/android/abi_gki_aarch64.stg.allowed_breaks index 7fd3832d7e2c..8f5f52dbe65a 100644 --- a/android/abi_gki_aarch64.stg.allowed_breaks +++ b/android/abi_gki_aarch64.stg.allowed_breaks @@ -132,3 +132,84 @@ type 'struct io_ring_ctx' changed 1 variable symbol(s) removed 'struct tracepoint __tracepoint_android_vh_filemap_fault_before_folio_locked' +type 'struct kvm_protected_vm' changed + member 'struct maple_tree pinned_pages' was removed + member 'union { struct rb_root_cached pinned_pages; struct { struct maple_tree __unused; }; union { }; }' was added + +type 'struct kvm_hyp_req' changed + member changed from 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; }' to 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; struct { unsigned long guest_ipa; size_t size; } split; }' + type changed from 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned long guest_ipa; size_t size; } map; }' to 'union { struct { u8 dest; int nr_pages; int sz_alloc; } mem; struct { unsigned 
long guest_ipa; size_t size; } map; struct { unsigned long guest_ipa; size_t size; } split; }' + member 'struct { unsigned long guest_ipa; size_t size; } split' was added + +type 'struct scm_fp_list' changed + byte size changed from 2040 to 2048 + member 'short count_unix' was added + +type 'struct scm_fp_list' changed + byte size changed from 2048 to 2064 + member 'struct list_head vertices' was added + member 'short count_unix' changed + offset changed by 128 + +type 'struct scm_fp_list' changed + byte size changed from 2064 to 2072 + member 'struct unix_edge* edges' was added + member 'short count_unix' changed + offset changed by 64 + +type 'struct scm_fp_list' changed + byte size changed from 2072 to 2080 + member 'bool inflight' was added + 3 members ('struct list_head vertices' .. 'short count_unix') changed + offset changed by 64 + +type 'struct unix_edge' changed + byte size changed from 32 to 48 + member 'struct list_head stack_entry' was added + +type 'struct unix_vertex' changed + byte size changed from 40 to 48 + member 'unsigned long index' was added + +type 'struct unix_vertex' changed + byte size changed from 48 to 80 + member 'struct list_head scc_entry' was added + 2 members ('unsigned long out_degree' .. 'unsigned long index') changed + offset changed by 128 + member 'unsigned long lowlink' was added + member 'bool on_stack' was added + +type 'struct unix_sock' changed + member 'struct sock* listener' was added + 4 members ('struct list_head link' .. 'unsigned long gc_flags') changed + offset changed by 64 + +type 'struct unix_vertex' changed + byte size changed from 80 to 72 + member 'bool on_stack' was removed + +type 'struct unix_vertex' changed + member 'unsigned long lowlink' was removed + member 'unsigned long scc_index' was added + +type 'struct unix_sock' changed + byte size changed from 1216 to 1152 + member 'struct list_head link' was removed + member 'unsigned long inflight' was removed + member 'spinlock_t lock' changed + offset changed by -192 + member 'unsigned long gc_flags' was removed + 4 members ('struct socket_wq peer_wq' .. 
'struct sk_buff* oob_skb') changed + offset changed by -512 + +type 'struct unix_sock' changed + member 'struct sk_buff* oob_skb' changed + offset changed by 64 + +type 'struct scm_stat' changed + byte size changed from 4 to 16 + member 'unsigned long nr_unix_fds' was added + +type 'struct scm_fp_list' changed + member 'bool dead' was added + diff --git a/android/abi_gki_aarch64_nvidia b/android/abi_gki_aarch64_nvidia new file mode 100644 index 000000000000..2497126b8372 --- /dev/null +++ b/android/abi_gki_aarch64_nvidia @@ -0,0 +1,232 @@ +[abi_symbol_list] +# commonly used symbols + alloc_chrdev_region + alt_cb_patch_nops + __arch_copy_from_user + __arch_copy_to_user + cdev_add + cdev_del + cdev_init + __check_object_size + class_create + class_destroy + complete + dev_driver_string + _dev_err + device_create + device_destroy + _dev_info + devm_kfree + devm_kmalloc + devm_memremap + devm_request_threaded_irq + _dev_warn + fortify_panic + free_irq + __init_swait_queue_head + init_timer_key + __init_waitqueue_head + jiffies_to_usecs + kfree + __kmalloc + kmalloc_caches + kmalloc_trace + kstrtouint + log_post_read_mmio + log_read_mmio + memcpy + __memcpy_fromio + memset + module_layout + __mutex_init + mutex_lock + mutex_unlock + of_find_property + of_property_read_u32_index + of_property_read_variable_u32_array + panic + pid_task + __platform_driver_register + platform_driver_unregister + _printk + __put_task_struct + _raw_spin_lock + _raw_spin_unlock + request_threaded_irq + schedule_timeout + snprintf + __stack_chk_fail + strlen + strncmp + strnlen + strscpy + sysfs_create_group + sysfs_remove_group + system_cpucaps + system_wq + tegra_ivc_notified + tegra_ivc_read_advance + tegra_ivc_read_get_next_frame + tegra_ivc_reset + tegra_ivc_write_advance + tegra_ivc_write_get_next_frame + __traceiter_rwmmio_post_read + __traceiter_rwmmio_read + __tracepoint_rwmmio_post_read + __tracepoint_rwmmio_read + unregister_chrdev_region + __wake_up + __warn_printk + +# required by ivc-cdev.ko + device_del + devm_free_irq + noop_llseek + remap_pfn_range + +# required by ivc_ext.ko + dma_sync_single_for_cpu + __memcpy_toio + +# required by nvsciipc.ko + _dev_notice + __fdget + find_get_pid + fput + platform_device_register_full + platform_device_unregister + sprintf + +# required by tegra_bpmp.ko + clk_hw_determine_rate_no_reparent + clk_hw_get_name + clk_hw_unregister + debugfs_create_dir + debugfs_create_file + debugfs_remove + dentry_path_raw + devm_clk_hw_register + devm_reset_controller_register + dma_alloc_attrs + dma_free_attrs + _find_next_bit + kmalloc_large + kstrdup + ktime_get + of_clk_add_hw_provider + of_device_get_match_data + of_genpd_add_provider_onecell + __of_parse_phandle_with_args + of_platform_default_populate + pm_genpd_init + pm_genpd_remove + seq_lseek + seq_read + seq_write + single_open_size + single_release + strncpy + tegra_bpmp_free_mrq + tegra_bpmp_mrq_is_supported + tegra_bpmp_mrq_return + tegra_bpmp_request_mrq + tegra_bpmp_transfer + tegra_bpmp_transfer_atomic + tegra_sku_info + +# required by tegra_hv.ko + arm64_use_ng_mappings + class_create_file_ns + ioremap_prot + iounmap + irq_get_irq_data + memstart_addr + of_add_property + of_chosen + of_find_compatible_node + of_irq_get + pfn_is_map_memory + tegra_ivc_init + +# required by tegra_hv_pm_ctl.ko + __alloc_skb + find_vpid + finish_wait + init_net + init_wait_entry + msleep + __netlink_kernel_create + netlink_unicast + __nlmsg_put + prepare_to_wait_event + register_pm_notifier + schedule + strcmp + 
wait_for_completion_timeout + +# required by tegra_hv_vblk_oops.ko + delayed_work_timer_fn + dma_map_page_attrs + __get_free_pages + is_vmalloc_addr + queue_delayed_work_on + +# required by tegra_vblk.ko + blk_execute_rq + blk_mq_alloc_disk_for_queue + blk_mq_alloc_request + blk_mq_alloc_tag_set + blk_mq_destroy_queue + blk_mq_end_request + blk_mq_free_request + blk_mq_free_tag_set + blk_mq_init_queue + blk_mq_start_hw_queues + blk_mq_start_request + blk_mq_stop_hw_queues + blk_queue_flag_set + blk_queue_logical_block_size + blk_queue_max_discard_sectors + blk_queue_max_hw_sectors + blk_queue_max_secure_erase_sectors + blk_queue_physical_block_size + blk_queue_write_cache + __blk_rq_map_sg + capable + __cpu_possible_mask + del_gendisk + device_add_disk + device_create_file + disable_irq + disk_check_media_change + dma_map_sg_attrs + dma_unmap_sg_attrs + enable_irq + _find_first_zero_bit + jiffies + kasan_flag_enabled + kthread_create_on_cpu + kthread_create_on_node + __list_add_valid_or_report + __list_del_entry_valid_or_report + mod_timer + __num_online_cpus + of_find_node_by_name + put_disk + queue_work_on + _raw_spin_lock_irqsave + _raw_spin_unlock_irqrestore + __register_blkdev + sched_setattr_nocheck + set_capacity + set_disk_ro + sg_init_table + sg_nents + __sw_hweight64 + timer_delete + unregister_blkdev + vfree + vzalloc + wait_for_completion + wait_for_completion_interruptible + wake_up_process diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index f16966f1a0f3..d64fef8faf50 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -883,6 +883,7 @@ drm_mode_duplicate drm_mode_equal drm_mode_equal_no_clocks + drm_mode_is_420_only drm_mode_object_find drm_mode_object_get drm_mode_object_put @@ -2620,6 +2621,7 @@ touch_softlockup_watchdog trace_array_destroy trace_array_get_by_name + trace_array_get_by_name_ext trace_array_put trace_array_set_clr_event trace_event_buffer_commit @@ -2731,6 +2733,7 @@ __traceiter_android_vh_ufs_update_sysfs __traceiter_android_vh_usb_dev_resume __traceiter_android_vh_use_amu_fie + __traceiter_android_vh_xhci_full_reset_on_remove __traceiter_clock_set_rate __traceiter_cma_alloc_finish __traceiter_cma_alloc_start @@ -2869,6 +2872,7 @@ __tracepoint_android_vh_ufs_update_sysfs __tracepoint_android_vh_usb_dev_resume __tracepoint_android_vh_use_amu_fie + __tracepoint_android_vh_xhci_full_reset_on_remove __tracepoint_clock_set_rate __tracepoint_cma_alloc_finish __tracepoint_cma_alloc_start diff --git a/android/abi_gki_aarch64_vivo b/android/abi_gki_aarch64_vivo index 9cf149365475..b8f2d60402fd 100644 --- a/android/abi_gki_aarch64_vivo +++ b/android/abi_gki_aarch64_vivo @@ -154,6 +154,8 @@ __traceiter_android_vh_look_around_migrate_folio __traceiter_android_vh_lruvec_add_folio __traceiter_android_vh_lruvec_del_folio + __traceiter_android_vh_mempool_alloc_skip_wait + __traceiter_android_vh_mm_free_page __traceiter_android_vh_mmap_region __traceiter_android_vh_mutex_init __traceiter_android_vh_mutex_unlock_slowpath @@ -284,6 +286,8 @@ __tracepoint_android_vh_look_around_migrate_folio __tracepoint_android_vh_lruvec_add_folio __tracepoint_android_vh_lruvec_del_folio + __tracepoint_android_vh_mempool_alloc_skip_wait + __tracepoint_android_vh_mm_free_page __tracepoint_android_vh_mmap_region __tracepoint_android_vh_mutex_init __tracepoint_android_vh_mutex_unlock_slowpath diff --git a/android/abi_gki_aarch64_xiaomi b/android/abi_gki_aarch64_xiaomi index 66172fb2a848..a8531903d2a7 100644 --- 
a/android/abi_gki_aarch64_xiaomi +++ b/android/abi_gki_aarch64_xiaomi @@ -23,6 +23,8 @@ __tracepoint_android_vh_tune_swappiness __traceiter_android_vh_do_shrink_slab_ex __tracepoint_android_vh_do_shrink_slab_ex + __traceiter_android_vh_migration_target_bypass + __tracepoint_android_vh_migration_target_bypass # required by lz4 decompress module __tracepoint_android_vh_lz4_decompress_bypass diff --git a/android/abi_gki_aarch64_xiaomi2 b/android/abi_gki_aarch64_xiaomi_xring similarity index 99% rename from android/abi_gki_aarch64_xiaomi2 rename to android/abi_gki_aarch64_xiaomi_xring index c27a275ed937..bbaae6d45165 100644 --- a/android/abi_gki_aarch64_xiaomi2 +++ b/android/abi_gki_aarch64_xiaomi_xring @@ -1911,6 +1911,7 @@ scsi_report_bus_reset scsi_scan_host scsi_unblock_requests + scsi_host_busy sdev_prefix_printk security_file_ioctl select_fallback_rq diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index 1de2ad0b0bdf..aee331a1430b 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -737,6 +737,7 @@ CONFIG_CRYPTO_LZ4=y CONFIG_CRYPTO_ZSTD=y CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_CRYPTO_GHASH_ARM64_CE=y +CONFIG_CRYPTO_SHA1_ARM64_CE=y CONFIG_CRYPTO_SHA2_ARM64_CE=y CONFIG_CRYPTO_SHA512_ARM64_CE=y CONFIG_CRYPTO_POLYVAL_ARM64_CE=y diff --git a/arch/arm64/configs/microdroid_defconfig b/arch/arm64/configs/microdroid_defconfig index 5cc6dec65c71..2d67e00899b5 100644 --- a/arch/arm64/configs/microdroid_defconfig +++ b/arch/arm64/configs/microdroid_defconfig @@ -8,6 +8,8 @@ CONFIG_RCU_EXPERT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 +CONFIG_CGROUPS=y +CONFIG_MEMCG=y # CONFIG_RD_GZIP is not set # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set @@ -136,8 +138,10 @@ CONFIG_STATIC_USERMODEHELPER_PATH="" CONFIG_SECURITY_SELINUX=y CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y CONFIG_BUG_ON_DATA_CORRUPTION=y +CONFIG_CRYPTO_SHA1=y CONFIG_CRYPTO_HCTR2=y CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_SHA1_ARM64_CE=y CONFIG_CRYPTO_SHA2_ARM64_CE=y CONFIG_CRYPTO_POLYVAL_ARM64_CE=y CONFIG_CRYPTO_AES_ARM64_CE_BLK=y diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 34a2e60525e6..9cc0033513a9 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -83,6 +83,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_relax_perms, __KVM_HOST_SMCCC_FUNC___pkvm_wrprotect, __KVM_HOST_SMCCC_FUNC___pkvm_dirty_log, + __KVM_HOST_SMCCC_FUNC___pkvm_host_split_guest, __KVM_HOST_SMCCC_FUNC___pkvm_tlb_flush_vmid, __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7336137bf221..e4504048783c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -224,20 +224,36 @@ struct kvm_smccc_features { }; struct kvm_pinned_page { + union { + struct rb_node node; + struct list_head list_node; + }; struct page *page; u64 ipa; + u64 __subtree_last; u8 order; u16 pins; }; -#define KVM_DUMMY_PPAGE ((struct kvm_pinned_page *)-1) +struct kvm_pinned_page +*kvm_pinned_pages_iter_first(struct rb_root_cached *root, u64 start, u64 end); +struct kvm_pinned_page +*kvm_pinned_pages_iter_next(struct kvm_pinned_page *ppage, u64 start, u64 end); + +#define for_ppage_node_in_range(kvm, start, end, __ppage, __tmp) \ + for (__ppage = kvm_pinned_pages_iter_first(&(kvm)->arch.pkvm.pinned_pages, start, end - 1);\ + __ppage && ({ __tmp = kvm_pinned_pages_iter_next(__ppage, start, 
end - 1); 1; }); \ + __ppage = __tmp) + +void kvm_pinned_pages_remove(struct kvm_pinned_page *ppage, + struct rb_root_cached *root); typedef unsigned int pkvm_handle_t; struct kvm_protected_vm { pkvm_handle_t handle; struct kvm_hyp_memcache stage2_teardown_mc; - struct maple_tree pinned_pages; + _ANDROID_KABI_REPLACE(struct maple_tree __unused, struct rb_root_cached pinned_pages); gpa_t pvmfw_load_addr; bool enabled; }; @@ -525,6 +541,7 @@ struct kvm_hyp_req { #define KVM_HYP_LAST_REQ 0 #define KVM_HYP_REQ_TYPE_MEM 1 #define KVM_HYP_REQ_TYPE_MAP 2 +#define KVM_HYP_REQ_TYPE_SPLIT 3 u8 type; union { struct { @@ -539,6 +556,12 @@ struct kvm_hyp_req { unsigned long guest_ipa; size_t size; } map; +#ifndef __GENKSYMS__ + struct { + unsigned long guest_ipa; + size_t size; + } split; +#endif }; }; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 03122dbd5ac2..a174ca1b9e66 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -184,6 +184,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t size); +int __pkvm_pgtable_stage2_split(struct kvm_vcpu *vcpu, phys_addr_t ipa, size_t size); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 4c521003ad22..90c647d4b329 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -862,8 +862,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); * kvm_pgtable_stage2_split() is best effort: it tries to break as many * blocks in the input range as allowed by @mc_capacity. */ -int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, - struct kvm_mmu_memory_cache *mc); +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc); /** * kvm_pgtable_walk() - Walk a page-table. 
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 92d82b2b4bc7..0179275ac299 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -363,6 +363,11 @@ static int handle_hyp_req_map(struct kvm_vcpu *vcpu, return pkvm_mem_abort_range(vcpu, req->map.guest_ipa, req->map.size); } +static int handle_hyp_req_split(struct kvm_vcpu *vcpu, struct kvm_hyp_req *req) +{ + return __pkvm_pgtable_stage2_split(vcpu, req->split.guest_ipa, req->split.size); +} + static int handle_hyp_req(struct kvm_vcpu *vcpu) { struct kvm_hyp_req *hyp_req = vcpu->arch.hyp_reqs; @@ -379,6 +384,9 @@ static int handle_hyp_req(struct kvm_vcpu *vcpu) case KVM_HYP_REQ_TYPE_MAP: ret = handle_hyp_req_map(vcpu, hyp_req); break; + case KVM_HYP_REQ_TYPE_SPLIT: + ret = handle_hyp_req_split(vcpu, hyp_req); + break; default: pr_warn("Unknown kvm_hyp_req type: %d\n", hyp_req->type); ret = -EINVAL; diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 40d9b7341097..5c02f8c8fb06 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -63,6 +63,7 @@ int __pkvm_host_unuse_dma(u64 phys_addr, size_t size); int __pkvm_guest_stage2_snapshot(struct kvm_pgtable_snapshot *snap, struct pkvm_hyp_vm *vm); int __pkvm_host_stage2_snapshot(struct kvm_pgtable_snapshot *snap); int __pkvm_host_lazy_pte(u64 pfn, u64 nr_pages, bool enable); +int __pkvm_host_split_guest(u64 pfn, u64 gfn, u64 size, struct pkvm_hyp_vcpu *vcpu); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, diff --git a/arch/arm64/kvm/hyp/nvhe/alloc.c b/arch/arm64/kvm/hyp/nvhe/alloc.c index dc1a7371b694..13f77784e3be 100644 --- a/arch/arm64/kvm/hyp/nvhe/alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/alloc.c @@ -556,7 +556,7 @@ void *hyp_alloc(size_t size) unsigned long chunk_addr; int missing_map, ret = 0; - size = ALIGN(size, MIN_ALLOC); + size = ALIGN(size ?: MIN_ALLOC, MIN_ALLOC); hyp_spin_lock(&allocator->lock); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index e8c484dcc769..a5ffc4cd3f70 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -1073,6 +1073,27 @@ out: cpu_reg(host_ctxt, 1) = ret; } +static void handle___pkvm_host_split_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(u64, size, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu) + goto out; + + ret = __pkvm_host_split_guest(pfn, gfn, size, hyp_vcpu); + +out: + cpu_reg(host_ctxt, 1) = ret; +} + static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) { struct pkvm_hyp_vcpu *hyp_vcpu; @@ -1618,6 +1639,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_relax_perms), HANDLE_FUNC(__pkvm_wrprotect), HANDLE_FUNC(__pkvm_dirty_log), + HANDLE_FUNC(__pkvm_host_split_guest), HANDLE_FUNC(__pkvm_tlb_flush_vmid), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index c95a5e896251..bc1f8cb3faf3 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -387,6 +387,10 @@ static int relinquish_walker(const struct kvm_pgtable_visit_ctx *ctx, if (!kvm_pte_valid(pte)) return 0; + /* We don't 
support splitting non-leaf mappings */ + if (ctx->level != (KVM_PGTABLE_MAX_LEVELS - 1)) + return -E2BIG; + state = pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); if (state != data->expected_state) return -EPERM; @@ -433,8 +437,7 @@ int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu, goto end; /* Zap the guest stage2 pte and return ownership to the host */ - ret = kvm_pgtable_stage2_annotate(&vm->pgt, ipa, PAGE_SIZE, - &vcpu->vcpu.arch.stage2_mc, 0); + ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE); if (ret) goto end; @@ -2760,6 +2763,30 @@ unlock: } +int __pkvm_host_split_guest(u64 pfn, u64 gfn, u64 size, struct pkvm_hyp_vcpu *vcpu) +{ + struct kvm_hyp_memcache *mc = &vcpu->vcpu.arch.stage2_mc; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); + int ret; + + if (size != PMD_SIZE) + return -EINVAL; + + guest_lock_component(vm); + + /* + * stage2_split() already checks the existing mapping is valid and PMD-level. + * No other check is necessary. + */ + + ret = kvm_pgtable_stage2_split(&vm->pgt, ipa, size, mc); + + guest_unlock_component(vm); + + return ret; +} + int __pkvm_host_donate_guest(struct pkvm_hyp_vcpu *vcpu, u64 pfn, u64 gfn, u64 nr_pages) { diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index fa4c40b0a172..feaad44fe204 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -702,16 +702,13 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, if (ret) goto done; - ret = pkvm_vcpu_init_psci(hyp_vcpu); - if (ret) - goto done; - if (test_bit(KVM_ARM_VCPU_SVE, hyp_vcpu->vcpu.arch.features)) { ret = init_pkvm_hyp_vcpu_sve(hyp_vcpu, host_vcpu); if (ret) goto done; } + WARN_ON(pkvm_vcpu_init_psci(hyp_vcpu)); pkvm_vcpu_init_traps(hyp_vcpu); kvm_reset_pvm_sys_regs(&hyp_vcpu->vcpu); done: @@ -1588,9 +1585,19 @@ static bool pkvm_memrelinquish_call(struct pkvm_hyp_vcpu *hyp_vcpu, goto out_guest_err; ret = __pkvm_guest_relinquish_to_host(hyp_vcpu, ipa, &pa); - if (ret == -ENOMEM) { - if (pkvm_handle_empty_memcache(hyp_vcpu, exit_code)) + if (ret == -E2BIG) { + struct kvm_hyp_req *req = pkvm_hyp_req_reserve(hyp_vcpu, KVM_HYP_REQ_TYPE_SPLIT); + + if (!req) { + ret = -ENOMEM; goto out_guest_err; + } + + req->split.guest_ipa = ALIGN_DOWN(ipa, PMD_SIZE); + req->split.size = PMD_SIZE; + + write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR); + *exit_code = ARM_EXCEPTION_HYP_REQ; return false; } else if (ret) { diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index d337e9349a65..3dab9deb3415 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1769,13 +1769,49 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, return 0; } -int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, - struct kvm_mmu_memory_cache *mc) +static int pkvm_stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) { + struct stage2_map_data *data = ctx->arg; + struct kvm_pgtable *pgt = data->mmu->pgt; + struct kvm_hyp_memcache *mc = data->memcache; + enum kvm_pgtable_prot prot; + kvm_pte_t pte = ctx->old; + kvm_pte_t *childp; + + if (ctx->level == KVM_PGTABLE_MAX_LEVELS - 1) + return 0; + + /* We can only split PMD-level blocks */ + if (!kvm_pte_valid(pte) || ctx->level != KVM_PGTABLE_MAX_LEVELS - 2) + return -EINVAL; + + prot = kvm_pgtable_stage2_pte_prot(pte); + childp = kvm_pgtable_stage2_create_unlinked(pgt, kvm_pte_to_phys(pte), + ctx->level, prot, mc, 
true); + if (IS_ERR(childp)) + return PTR_ERR(childp); + + WARN_ON(!stage2_try_break_pte(ctx, data->mmu)); + + stage2_make_pte(ctx, kvm_init_table_pte(childp, ctx->mm_ops)); + dsb(ishst); + + return 0; +} + +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc) +{ + struct stage2_map_data data = { + .mmu = pgt->mmu, + .memcache = mc, + }; struct kvm_pgtable_walker walker = { - .cb = stage2_split_walker, + .cb = static_branch_unlikely(&kvm_protected_mode_initialized) ? + pkvm_stage2_split_walker : stage2_split_walker, + .arg = static_branch_unlikely(&kvm_protected_mode_initialized) ? + &data : mc, .flags = KVM_PGTABLE_WALK_LEAF, - .arg = mc, }; return kvm_pgtable_walk(pgt, addr, size, &walker); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c6ec30a19b3b..38d25bab1057 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -6,11 +6,11 @@ #include #include -#include #include #include #include #include +#include #include #include #include @@ -291,6 +291,20 @@ static void invalidate_icache_guest_page(void *va, size_t size) __invalidate_icache_guest_page(va, size); } +static u64 __pinned_page_start(struct kvm_pinned_page *ppage) +{ + return ppage->ipa; +} + +static u64 __pinned_page_end(struct kvm_pinned_page *ppage) +{ + return ppage->ipa + (1 << (ppage->order + PAGE_SHIFT)) - 1; +} + +INTERVAL_TREE_DEFINE(struct kvm_pinned_page, node, u64, __subtree_last, + __pinned_page_start, __pinned_page_end, /* empty */, + kvm_pinned_pages); + static int __pkvm_unmap_guest_call(u64 pfn, u64 gfn, u8 order, void *args) { struct kvm *kvm = args; @@ -312,7 +326,7 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage) * no update needed from here. */ unpin_user_pages(&ppage->page, 1); - mtree_erase(&kvm->arch.pkvm.pinned_pages, ppage->ipa); + kvm_pinned_pages_remove(ppage, &kvm->arch.pkvm.pinned_pages); kfree(ppage); return 0; @@ -320,17 +334,12 @@ static int pkvm_unmap_guest(struct kvm *kvm, struct kvm_pinned_page *ppage) static int pkvm_unmap_range(struct kvm *kvm, u64 start, u64 end) { + struct kvm_pinned_page *ppage, *tmp; struct mm_struct *mm = kvm->mm; - unsigned long index = start; unsigned long cnt = 0; - void *entry; int ret = 0; - mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) { - struct kvm_pinned_page *ppage = entry; - - if (ppage == KVM_DUMMY_PPAGE) - continue; + for_ppage_node_in_range(kvm, start, end, ppage, tmp) { ret = pkvm_unmap_guest(kvm, ppage); if (ret) break; @@ -418,8 +427,7 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si static void pkvm_stage2_flush(struct kvm *kvm) { - unsigned long index = 0; - void *entry; + struct kvm_pinned_page *ppage, *tmp; /* * Contrary to stage2_apply_range(), we don't need to check @@ -427,11 +435,7 @@ static void pkvm_stage2_flush(struct kvm *kvm) * from a vcpu thread, and the list is only ever freed on VM * destroy (which only occurs when all vcpu are gone). 
*/ - mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) { - struct kvm_pinned_page *ppage = entry; - - if (ppage == KVM_DUMMY_PPAGE) - continue; + for_ppage_node_in_range(kvm, 0, ULONG_MAX, ppage, tmp) { __clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE); cond_resched_rwlock_write(&kvm->mmu_lock); } @@ -1014,7 +1018,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); - mt_init_flags(&kvm->arch.pkvm.pinned_pages, MT_FLAGS_USE_RCU); mmu->arch = &kvm->arch; if (is_protected_kvm_enabled()) @@ -1293,18 +1296,13 @@ static int __pkvm_wrprotect_call(u64 pfn, u64 gfn, u8 order, void *args) static int pkvm_wp_range(struct kvm *kvm, u64 start, u64 end) { - unsigned long index = start; - void *entry; + struct kvm_pinned_page *ppage, *tmp; - mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, end - 1) { - struct kvm_pinned_page *ppage = entry; + for_ppage_node_in_range(kvm, start, end, ppage, tmp) { int ret; - if (ppage == KVM_DUMMY_PPAGE) - continue; ret = pkvm_call_hyp_nvhe_ppage(ppage, __pkvm_wrprotect_call, kvm, false); - if (ret) return ret; } @@ -1630,28 +1628,9 @@ static int pkvm_host_map_guest(u64 pfn, u64 gfn, u64 nr_pages, return (ret == -EPERM) ? -EAGAIN : ret; } -static struct kvm_pinned_page * -find_ppage_or_above(struct kvm *kvm, phys_addr_t ipa) -{ - unsigned long index = ipa; - void *entry; - - mt_for_each(&kvm->arch.pkvm.pinned_pages, entry, index, ULONG_MAX) { - if (entry == KVM_DUMMY_PPAGE) - continue; - return entry; - } - - return NULL; -} - static struct kvm_pinned_page *find_ppage(struct kvm *kvm, u64 ipa) { - struct kvm_pinned_page *ppage; - unsigned long index = ipa; - - ppage = mt_find(&kvm->arch.pkvm.pinned_pages, &index, ipa + PAGE_SIZE - 1); - return ppage == KVM_DUMMY_PPAGE ? 
NULL : ppage; + return kvm_pinned_pages_iter_first(&kvm->arch.pkvm.pinned_pages, ipa, ipa + PAGE_SIZE - 1); } static int __pkvm_relax_perms_call(u64 pfn, u64 gfn, u8 order, void *args) @@ -1707,11 +1686,10 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa, { unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE; struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc; - unsigned long index, pmd_offset, page_size, end; + unsigned long page_size = PAGE_SIZE; struct mm_struct *mm = current->mm; struct kvm_pinned_page *ppage; struct kvm *kvm = vcpu->kvm; - struct maple_tree *mt = &kvm->arch.pkvm.pinned_pages; int ret, nr_pages; struct page *page; u64 pfn; @@ -1760,66 +1738,49 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t *fault_ipa, } pfn = page_to_pfn(page); - pmd_offset = *fault_ipa & (PMD_SIZE - 1); - page_size = transparent_hugepage_adjust(kvm, memslot, - hva, &pfn, - fault_ipa); - page = pfn_to_page(pfn); -retry: - if (size) - *size = page_size; + read_lock(&kvm->mmu_lock); + if (!kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages, + ALIGN_DOWN(*fault_ipa, PMD_SIZE), + ALIGN(*fault_ipa + 1, PMD_SIZE) - 1)) + page_size = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, fault_ipa); + + /* + * We take the risk of racing with another vCPU, but sync will be restored by the + * host_map_guest HVC + */ + read_unlock(&kvm->mmu_lock); + + page = pfn_to_page(pfn); ret = account_locked_vm(mm, page_size >> PAGE_SHIFT, true); if (ret) goto unpin; - index = *fault_ipa; - end = index + page_size - 1; ppage->page = page; ppage->ipa = *fault_ipa; ppage->order = get_order(page_size); ppage->pins = 1 << ppage->order; - /* - * If we already have a mapping in the middle of the THP, we have no - * other choice than enforcing PAGE_SIZE for pkvm_host_map_guest() to - * succeed. 
- */ - if (page_size > PAGE_SIZE && mt_find(mt, &index, end)) { - *fault_ipa += pmd_offset; - pfn += pmd_offset >> PAGE_SHIFT; - page = pfn_to_page(pfn); - account_locked_vm(mm, page_size >> PAGE_SHIFT, false); - page_size = PAGE_SIZE; - goto retry; - } - - /* Reserve space in the mtree */ - ret = mtree_insert_range(mt, index, end, KVM_DUMMY_PPAGE, GFP_KERNEL); - if (ret) { - if (ret == -EEXIST) - ret = 0; - goto dec_account; - } - write_lock(&kvm->mmu_lock); ret = pkvm_host_map_guest(pfn, *fault_ipa >> PAGE_SHIFT, page_size >> PAGE_SHIFT, KVM_PGTABLE_PROT_R); if (ret) { - if (WARN_ON(ret == -EAGAIN)) + if (ret == -EAGAIN) ret = 0; goto err_unlock; } - WARN_ON(mtree_store_range(mt, index, end, ppage, GFP_ATOMIC)); + kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages); write_unlock(&kvm->mmu_lock); + if (size) + *size = page_size; + return 0; err_unlock: write_unlock(&kvm->mmu_lock); -dec_account: account_locked_vm(mm, page_size >> PAGE_SHIFT, false); unpin: unpin_user_pages(&page, 1); @@ -1847,13 +1808,13 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si idx = srcu_read_lock(&vcpu->kvm->srcu); read_lock(&vcpu->kvm->mmu_lock); - ppage = find_ppage_or_above(vcpu->kvm, fault_ipa); + ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages, + fault_ipa, ipa_end); while (fault_ipa < ipa_end) { - if (ppage && ppage != KVM_DUMMY_PPAGE && ppage->ipa == fault_ipa) { + if (ppage && ppage->ipa == fault_ipa) { page_size = PAGE_SIZE << ppage->order; - ppage = mt_next(&vcpu->kvm->arch.pkvm.pinned_pages, - ppage->ipa, ULONG_MAX); + ppage = kvm_pinned_pages_iter_next(ppage, fault_ipa, ipa_end); } else { gfn_t gfn = gpa_to_gfn(fault_ipa); struct kvm_memory_slot *memslot; @@ -1877,7 +1838,8 @@ int pkvm_mem_abort_range(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, size_t si * We had to release the mmu_lock so let's update the * reference. */ - ppage = find_ppage_or_above(vcpu->kvm, fault_ipa + page_size); + ppage = kvm_pinned_pages_iter_first(&vcpu->kvm->arch.pkvm.pinned_pages, + fault_ipa + PAGE_SIZE, ipa_end); } fault_ipa += page_size; @@ -1889,6 +1851,162 @@ end: return err; } +static int __pkvm_pin_user_pages(struct kvm *kvm, struct kvm_memory_slot *memslot, + u64 gfn, u64 nr_pages, struct page ***__pages) +{ + unsigned long hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL); + unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE; + struct mm_struct *mm = current->mm; + struct page **pages; + long ret; + int p; + + pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + mmap_read_lock(mm); + ret = pin_user_pages(hva, nr_pages, flags, pages); + mmap_read_unlock(mm); + + if (ret == -EHWPOISON) { + kvm_send_hwpoison_signal(hva, PAGE_SHIFT); + goto err_free_pages; + } else if (ret == -EFAULT) { + /* Will try MMIO map */ + ret = -EREMOTEIO; + goto err_free_pages; + } else if (ret < 0) { + ret = -EFAULT; + goto err_free_pages; + } else if (ret != nr_pages) { + nr_pages = ret; + ret = -EFAULT; + goto err_unpin_pages; + } + + /* See PageSwapBacked() in pkvm_mem_abort() */ + for (p = 0; p < nr_pages; p++) { + if (!folio_test_swapbacked(page_folio(pages[p]))) { + ret = -EIO; + goto err_unpin_pages; + } + } + + *__pages = pages; + return 0; + +err_unpin_pages: + unpin_user_pages(pages, nr_pages); +err_free_pages: + kfree(pages); + return ret; +} + +/* + * Splitting is only expected on the back of a relinquish guest HVC in the pKVM case, while + * pkvm_pgtable_stage2_split() can be called with dirty logging. 
+ */ +int __pkvm_pgtable_stage2_split(struct kvm_vcpu *vcpu, phys_addr_t ipa, size_t size) +{ + struct list_head ppage_prealloc = LIST_HEAD_INIT(ppage_prealloc); + struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.stage2_mc; + struct kvm_pinned_page *ppage, *tmp; + struct kvm_memory_slot *memslot; + struct kvm *kvm = vcpu->kvm; + int idx, p, ret, nr_pages; + struct page **pages; + kvm_pfn_t pfn; + gfn_t gfn; + + if (!IS_ALIGNED(ipa, PMD_SIZE) || size != PMD_SIZE) + return -EINVAL; + + if (!hyp_memcache->nr_pages) { + ret = topup_hyp_memcache(hyp_memcache, 1, 0); + if (ret) + return -ENOMEM; + + atomic64_add(PAGE_SIZE, &kvm->stat.protected_hyp_mem); + atomic64_add(PAGE_SIZE, &kvm->stat.protected_pgtable_mem); + } + + /* We already have 1 pin on the Huge Page */ + nr_pages = (size >> PAGE_SHIFT) - 1; + gfn = (ipa >> PAGE_SHIFT) + 1; + + /* Pre-allocate kvm_pinned_page before acquiring the mmu_lock */ + for (p = 0; p < nr_pages; p++) { + ppage = kzalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT); + if (!ppage) { + ret = -ENOMEM; + goto free_pinned_pages; + } + list_add(&ppage->list_node, &ppage_prealloc); + } + + idx = srcu_read_lock(&vcpu->kvm->srcu); + memslot = gfn_to_memslot(vcpu->kvm, gfn); + ret = __pkvm_pin_user_pages(kvm, memslot, gfn, nr_pages, &pages); + if (ret) + goto unlock_srcu; + + write_lock(&kvm->mmu_lock); + + ppage = find_ppage(kvm, ipa); + if (!ppage) { + ret = -EPERM; + goto end; + } else if (!ppage->order) { + ret = 0; + goto end; + } + + ret = kvm_call_hyp_nvhe(__pkvm_host_split_guest, page_to_pfn(ppage->page), + ipa >> PAGE_SHIFT, size); + if (ret) + goto end; + + ppage->order = 0; + ppage->pins = 1; + + pfn = page_to_pfn(ppage->page) + 1; + ipa = ipa + PAGE_SIZE; + while (nr_pages--) { + /* Pop a ppage from the pre-allocated list */ + ppage = list_first_entry(&ppage_prealloc, struct kvm_pinned_page, list_node); + list_del_init(&ppage->list_node); + + ppage->page = pfn_to_page(pfn); + ppage->ipa = ipa; + ppage->order = 0; + ppage->pins = 1; + kvm_pinned_pages_insert(ppage, &kvm->arch.pkvm.pinned_pages); + + pfn += 1; + ipa += PAGE_SIZE; + } + +end: + write_unlock(&kvm->mmu_lock); + + if (ret) + unpin_user_pages(pages, nr_pages); + kfree(pages); + +unlock_srcu: + srcu_read_unlock(&vcpu->kvm->srcu, idx); + +free_pinned_pages: + /* Free unused pre-allocated kvm_pinned_page */ + list_for_each_entry_safe(ppage, tmp, &ppage_prealloc, list_node) { + list_del(&ppage->list_node); + kfree(ppage); + } + + return ret; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 1b9334136f8e..4523cc6f2725 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -319,21 +319,17 @@ static int __reclaim_dying_guest_page_call(u64 pfn, u64 gfn, u8 order, void *arg static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) { + struct kvm_pinned_page *tmp, *ppage; struct mm_struct *mm = current->mm; - struct kvm_pinned_page *ppage; struct kvm_vcpu *host_vcpu; - unsigned long idx, ipa = 0; + unsigned long idx; if (!host_kvm->arch.pkvm.handle) goto out_free; WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, host_kvm->arch.pkvm.handle)); - mt_clear_in_rcu(&host_kvm->arch.pkvm.pinned_pages); - - mt_for_each(&host_kvm->arch.pkvm.pinned_pages, ppage, ipa, ULONG_MAX) { - if (WARN_ON(ppage == KVM_DUMMY_PPAGE)) - continue; + for_ppage_node_in_range(host_kvm, 0, ULONG_MAX, ppage, tmp) { WARN_ON(pkvm_call_hyp_nvhe_ppage(ppage, 
__reclaim_dying_guest_page_call, host_kvm, true)); @@ -341,9 +337,9 @@ static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) account_locked_vm(mm, 1, false); unpin_user_pages_dirty_lock(&ppage->page, 1, host_kvm->arch.pkvm.enabled); + kvm_pinned_pages_remove(ppage, &host_kvm->arch.pkvm.pinned_pages); kfree(ppage); } - mtree_destroy(&host_kvm->arch.pkvm.pinned_pages); WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm, host_kvm->arch.pkvm.handle)); @@ -538,21 +534,21 @@ void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa) { struct mm_struct *mm = current->mm; struct kvm_pinned_page *ppage; - unsigned long index = ipa; u16 pins; write_lock(&host_kvm->mmu_lock); - ppage = mt_find(&host_kvm->arch.pkvm.pinned_pages, &index, - index + PAGE_SIZE - 1); - if (ppage && ppage != KVM_DUMMY_PPAGE) { + ppage = kvm_pinned_pages_iter_first(&host_kvm->arch.pkvm.pinned_pages, + ipa, ipa + PAGE_SIZE - 1); + if (ppage) { + WARN_ON_ONCE(ppage->pins != 1); + if (ppage->pins) ppage->pins--; - else - WARN_ON(1); pins = ppage->pins; if (!pins) - mtree_erase(&host_kvm->arch.pkvm.pinned_pages, ipa); + kvm_pinned_pages_remove(ppage, + &host_kvm->arch.pkvm.pinned_pages); } write_unlock(&host_kvm->mmu_lock); diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index 5da55f8d4a16..c7bd6055c20b 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -672,6 +672,7 @@ CONFIG_CRYPTO_ZSTD=y CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_CRYPTO_AES_NI_INTEL=y CONFIG_CRYPTO_POLYVAL_CLMUL_NI=y +CONFIG_CRYPTO_SHA1_SSSE3=y CONFIG_CRYPTO_SHA256_SSSE3=y CONFIG_CRYPTO_SHA512_SSSE3=y CONFIG_CRC_CCITT=y diff --git a/arch/x86/configs/microdroid_defconfig b/arch/x86/configs/microdroid_defconfig index d58c3dc697d0..603199122fa2 100644 --- a/arch/x86/configs/microdroid_defconfig +++ b/arch/x86/configs/microdroid_defconfig @@ -14,12 +14,6 @@ CONFIG_UCLAMP_TASK=y CONFIG_UCLAMP_BUCKETS_COUNT=20 CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_BLK_CGROUP=y -CONFIG_CGROUP_SCHED=y -CONFIG_UCLAMP_TASK_GROUP=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CPUSETS=y -CONFIG_CGROUP_CPUACCT=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set # CONFIG_RD_XZ is not set @@ -47,7 +41,6 @@ CONFIG_CPU_FREQ_GOV_POWERSAVE=y CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y CONFIG_JUMP_LABEL=y # CONFIG_BLOCK_LEGACY_AUTOLOAD is not set -CONFIG_BLK_CGROUP_IOCOST=y CONFIG_PARTITION_ADVANCED=y # CONFIG_MSDOS_PARTITION is not set # CONFIG_MQ_IOSCHED_DEADLINE is not set @@ -209,6 +202,7 @@ CONFIG_CRYPTO_HCTR2=y CONFIG_CRYPTO_LZO=y CONFIG_CRYPTO_AES_NI_INTEL=y CONFIG_CRYPTO_POLYVAL_CLMUL_NI=y +CONFIG_CRYPTO_SHA1_SSSE3=y CONFIG_CRYPTO_SHA256_SSSE3=y CONFIG_CRYPTO_SHA512_SSSE3=y CONFIG_PRINTK_TIME=y diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 929ae45e0745..f5414f11d4f2 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -6642,10 +6642,10 @@ static void print_binder_transaction_ilocked(struct seq_file *m, } static void print_binder_work_ilocked(struct seq_file *m, - struct binder_proc *proc, - const char *prefix, - const char *transaction_prefix, - struct binder_work *w) + struct binder_proc *proc, + const char *prefix, + const char *transaction_prefix, + struct binder_work *w, bool hash_ptrs) { struct binder_node *node; struct binder_transaction *t; @@ -6668,9 +6668,15 @@ static void print_binder_work_ilocked(struct seq_file *m, break; case BINDER_WORK_NODE: node = container_of(w, struct binder_node, work); - seq_printf(m, "%snode work %d: u%016llx c%016llx\n", - prefix, node->debug_id, - 
(u64)node->ptr, (u64)node->cookie); + if (hash_ptrs) + seq_printf(m, "%snode work %d: u%p c%p\n", + prefix, node->debug_id, + (void *)(long)node->ptr, + (void *)(long)node->cookie); + else + seq_printf(m, "%snode work %d: u%016llx c%016llx\n", + prefix, node->debug_id, + (u64)node->ptr, (u64)node->cookie); break; case BINDER_WORK_DEAD_BINDER: seq_printf(m, "%shas dead binder\n", prefix); @@ -6695,7 +6701,7 @@ static void print_binder_work_ilocked(struct seq_file *m, static void print_binder_thread_ilocked(struct seq_file *m, struct binder_thread *thread, - int print_always) + bool print_always, bool hash_ptrs) { struct binder_transaction *t; struct binder_work *w; @@ -6725,14 +6731,16 @@ static void print_binder_thread_ilocked(struct seq_file *m, } list_for_each_entry(w, &thread->todo, entry) { print_binder_work_ilocked(m, thread->proc, " ", - " pending transaction", w); + " pending transaction", + w, hash_ptrs); } if (!print_always && m->count == header_pos) m->count = start_pos; } static void print_binder_node_nilocked(struct seq_file *m, - struct binder_node *node) + struct binder_node *node, + bool hash_ptrs) { struct binder_ref *ref; struct binder_work *w; @@ -6742,8 +6750,13 @@ static void print_binder_node_nilocked(struct seq_file *m, hlist_for_each_entry(ref, &node->refs, node_entry) count++; - seq_printf(m, " node %d: u%016llx c%016llx pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d", - node->debug_id, (u64)node->ptr, (u64)node->cookie, + if (hash_ptrs) + seq_printf(m, " node %d: u%p c%p", node->debug_id, + (void *)(long)node->ptr, (void *)(long)node->cookie); + else + seq_printf(m, " node %d: u%016llx c%016llx", node->debug_id, + (u64)node->ptr, (u64)node->cookie); + seq_printf(m, " pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d", node->sched_policy, node->min_priority, node->has_strong_ref, node->has_weak_ref, node->local_strong_refs, node->local_weak_refs, @@ -6757,7 +6770,8 @@ static void print_binder_node_nilocked(struct seq_file *m, if (node->proc) { list_for_each_entry(w, &node->async_todo, entry) print_binder_work_ilocked(m, node->proc, " ", - " pending async transaction", w); + " pending async transaction", + w, hash_ptrs); } } @@ -6773,8 +6787,54 @@ static void print_binder_ref_olocked(struct seq_file *m, binder_node_unlock(ref->node); } -static void print_binder_proc(struct seq_file *m, - struct binder_proc *proc, int print_all) +/** + * print_next_binder_node_ilocked() - Print binder_node from a locked list + * @m: struct seq_file for output via seq_printf() + * @proc: struct binder_proc we hold the inner_proc_lock to (if any) + * @node: struct binder_node to print fields of + * @prev_node: struct binder_node we hold a temporary reference to (if any) + * @hash_ptrs: whether to hash @node's binder_uintptr_t fields + * + * Helper function to handle synchronization around printing a struct + * binder_node while iterating through @proc->nodes or the dead nodes list. + * Caller must hold either @proc->inner_lock (for live nodes) or + * binder_dead_nodes_lock. This lock will be released during the body of this + * function, but it will be reacquired before returning to the caller. + * + * Return: pointer to the struct binder_node we hold a tmpref on + */ +static struct binder_node * +print_next_binder_node_ilocked(struct seq_file *m, struct binder_proc *proc, + struct binder_node *node, + struct binder_node *prev_node, bool hash_ptrs) +{ + /* + * Take a temporary reference on the node so that isn't freed while + * we print it. 
+ */ + binder_inc_node_tmpref_ilocked(node); + /* + * Live nodes need to drop the inner proc lock and dead nodes need to + * drop the binder_dead_nodes_lock before trying to take the node lock. + */ + if (proc) + binder_inner_proc_unlock(proc); + else + spin_unlock(&binder_dead_nodes_lock); + if (prev_node) + binder_put_node(prev_node); + binder_node_inner_lock(node); + print_binder_node_nilocked(m, node, hash_ptrs); + binder_node_inner_unlock(node); + if (proc) + binder_inner_proc_lock(proc); + else + spin_lock(&binder_dead_nodes_lock); + return node; +} + +static void print_binder_proc(struct seq_file *m, struct binder_proc *proc, + bool print_all, bool hash_ptrs) { struct binder_work *w; struct rb_node *n; @@ -6787,31 +6847,19 @@ static void print_binder_proc(struct seq_file *m, header_pos = m->count; binder_inner_proc_lock(proc); - for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) + for (n = rb_first(&proc->threads); n; n = rb_next(n)) print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread, - rb_node), print_all); + rb_node), print_all, hash_ptrs); - for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { + for (n = rb_first(&proc->nodes); n; n = rb_next(n)) { struct binder_node *node = rb_entry(n, struct binder_node, rb_node); if (!print_all && !node->has_async_transaction) continue; - /* - * take a temporary reference on the node so it - * survives and isn't removed from the tree - * while we print it. - */ - binder_inc_node_tmpref_ilocked(node); - /* Need to drop inner lock to take node lock */ - binder_inner_proc_unlock(proc); - if (last_node) - binder_put_node(last_node); - binder_node_inner_lock(node); - print_binder_node_nilocked(m, node); - binder_node_inner_unlock(node); - last_node = node; - binder_inner_proc_lock(proc); + last_node = print_next_binder_node_ilocked(m, proc, node, + last_node, + hash_ptrs); } binder_inner_proc_unlock(proc); if (last_node) @@ -6819,19 +6867,18 @@ static void print_binder_proc(struct seq_file *m, if (print_all) { binder_proc_lock(proc); - for (n = rb_first(&proc->refs_by_desc); - n != NULL; - n = rb_next(n)) + for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n)) print_binder_ref_olocked(m, rb_entry(n, - struct binder_ref, - rb_node_desc)); + struct binder_ref, + rb_node_desc)); binder_proc_unlock(proc); } binder_alloc_print_allocated(m, &proc->alloc); binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) print_binder_work_ilocked(m, proc, " ", - " pending transaction", w); + " pending transaction", w, + hash_ptrs); list_for_each_entry(w, &proc->delivered_death, entry) { seq_puts(m, " has delivered dead binder\n"); break; @@ -6958,7 +7005,7 @@ static void print_binder_proc_stats(struct seq_file *m, count = 0; ready_threads = 0; binder_inner_proc_lock(proc); - for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) + for (n = rb_first(&proc->threads); n; n = rb_next(n)) count++; list_for_each_entry(thread, &proc->waiting_threads, waiting_thread_node) @@ -6972,7 +7019,7 @@ static void print_binder_proc_stats(struct seq_file *m, ready_threads, free_async_space); count = 0; - for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) + for (n = rb_first(&proc->nodes); n; n = rb_next(n)) count++; binder_inner_proc_unlock(proc); seq_printf(m, " nodes: %d\n", count); @@ -6980,7 +7027,7 @@ static void print_binder_proc_stats(struct seq_file *m, strong = 0; weak = 0; binder_proc_lock(proc); - for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { + for (n = 
rb_first(&proc->refs_by_desc); n; n = rb_next(n)) { struct binder_ref *ref = rb_entry(n, struct binder_ref, rb_node_desc); count++; @@ -7007,7 +7054,7 @@ static void print_binder_proc_stats(struct seq_file *m, print_binder_stats(m, " ", &proc->stats); } -static int state_show(struct seq_file *m, void *unused) +static void print_binder_state(struct seq_file *m, bool hash_ptrs) { struct binder_proc *proc; struct binder_node *node; @@ -7018,31 +7065,40 @@ static int state_show(struct seq_file *m, void *unused) spin_lock(&binder_dead_nodes_lock); if (!hlist_empty(&binder_dead_nodes)) seq_puts(m, "dead nodes:\n"); - hlist_for_each_entry(node, &binder_dead_nodes, dead_node) { - /* - * take a temporary reference on the node so it - * survives and isn't removed from the list - * while we print it. - */ - node->tmp_refs++; - spin_unlock(&binder_dead_nodes_lock); - if (last_node) - binder_put_node(last_node); - binder_node_lock(node); - print_binder_node_nilocked(m, node); - binder_node_unlock(node); - last_node = node; - spin_lock(&binder_dead_nodes_lock); - } + hlist_for_each_entry(node, &binder_dead_nodes, dead_node) + last_node = print_next_binder_node_ilocked(m, NULL, node, + last_node, + hash_ptrs); spin_unlock(&binder_dead_nodes_lock); if (last_node) binder_put_node(last_node); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) - print_binder_proc(m, proc, 1); + print_binder_proc(m, proc, true, hash_ptrs); mutex_unlock(&binder_procs_lock); +} +static void print_binder_transactions(struct seq_file *m, bool hash_ptrs) +{ + struct binder_proc *proc; + + seq_puts(m, "binder transactions:\n"); + mutex_lock(&binder_procs_lock); + hlist_for_each_entry(proc, &binder_procs, proc_node) + print_binder_proc(m, proc, false, hash_ptrs); + mutex_unlock(&binder_procs_lock); +} + +static int state_show(struct seq_file *m, void *unused) +{ + print_binder_state(m, false); + return 0; +} + +static int state_hashed_show(struct seq_file *m, void *unused) +{ + print_binder_state(m, true); return 0; } @@ -7064,14 +7120,13 @@ static int stats_show(struct seq_file *m, void *unused) static int transactions_show(struct seq_file *m, void *unused) { - struct binder_proc *proc; - - seq_puts(m, "binder transactions:\n"); - mutex_lock(&binder_procs_lock); - hlist_for_each_entry(proc, &binder_procs, proc_node) - print_binder_proc(m, proc, 0); - mutex_unlock(&binder_procs_lock); + print_binder_transactions(m, false); + return 0; +} +static int transactions_hashed_show(struct seq_file *m, void *unused) +{ + print_binder_transactions(m, true); return 0; } @@ -7084,7 +7139,7 @@ static int proc_show(struct seq_file *m, void *unused) hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == pid) { seq_puts(m, "binder proc state:\n"); - print_binder_proc(m, itr, 1); + print_binder_proc(m, itr, true, false); } } mutex_unlock(&binder_procs_lock); @@ -7151,8 +7206,10 @@ const struct file_operations binder_fops = { }; DEFINE_SHOW_ATTRIBUTE(state); +DEFINE_SHOW_ATTRIBUTE(state_hashed); DEFINE_SHOW_ATTRIBUTE(stats); DEFINE_SHOW_ATTRIBUTE(transactions); +DEFINE_SHOW_ATTRIBUTE(transactions_hashed); DEFINE_SHOW_ATTRIBUTE(transaction_log); const struct binder_debugfs_entry binder_debugfs_entries[] = { @@ -7162,6 +7219,12 @@ const struct binder_debugfs_entry binder_debugfs_entries[] = { .fops = &state_fops, .data = NULL, }, + { + .name = "state_hashed", + .mode = 0444, + .fops = &state_hashed_fops, + .data = NULL, + }, { .name = "stats", .mode = 0444, @@ -7174,6 +7237,12 @@ const struct 
binder_debugfs_entry binder_debugfs_entries[] = { .fops = &transactions_fops, .data = NULL, }, + { + .name = "transactions_hashed", + .mode = 0444, + .fops = &transactions_hashed_fops, + .data = NULL, + }, { .name = "transaction_log", .mode = 0444, diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index 82be9bd97eb3..eaea41831d1f 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -490,6 +490,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_lruvec_add_folio); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_lruvec_del_folio); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_lazyfree_bypass); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_async_mmap_readahead); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mm_free_page); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_page_mapcount); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_add_page_to_lrulist); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_del_page_from_lrulist); @@ -676,3 +677,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_filemap_fault_pre_folio_locked); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_filemap_folio_mapped); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_folio_remove_rmap_ptes); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_pageset_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_xhci_full_reset_on_remove); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mempool_alloc_skip_wait); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 618c31e17833..20975a271b13 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1002,7 +1002,7 @@ static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer) * If 'expires' is after the current time, we've been called * too early. */ - if (expires > 0 && expires < ktime_get_mono_fast_ns()) { + if (expires > 0 && expires <= ktime_get_mono_fast_ns()) { dev->power.timer_expires = 0; rpm_suspend(dev, dev->power.timer_autosuspends ? (RPM_ASYNC | RPM_AUTO) : RPM_ASYNC); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-kvm.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-kvm.c index 4d32b9d5320e..315b1b17aaa0 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-kvm.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-kvm.c @@ -284,15 +284,13 @@ static int kvm_arm_smmu_domain_finalize(struct kvm_arm_smmu_domain *kvm_smmu_dom return 0; } - kvm_smmu_domain->smmu = smmu; - if (kvm_smmu_domain->domain.type == IOMMU_DOMAIN_IDENTITY) { kvm_smmu_domain->id = KVM_IOMMU_DOMAIN_IDMAP_ID; /* * Identity domains doesn't use the DMA API, so no need to * set the domain aperture. */ - return 0; + goto out; } /* Default to stage-1. 
*/ @@ -325,7 +323,13 @@ static int kvm_arm_smmu_domain_finalize(struct kvm_arm_smmu_domain *kvm_smmu_dom ret = kvm_call_hyp_nvhe_mc(__pkvm_host_iommu_alloc_domain, kvm_smmu_domain->id, kvm_smmu_domain->type); + if (ret) { + ida_free(&kvm_arm_smmu_domain_ida, kvm_smmu_domain->id); + return ret; + } +out: + kvm_smmu_domain->smmu = smmu; return ret; } diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index c673118cc8c8..43703184cff4 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -629,7 +629,6 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) int tag = scsi_cmd_to_rq(cmd)->tag; struct ufshcd_lrb *lrbp = &hba->lrb[tag]; struct ufs_hw_queue *hwq; - unsigned long flags; int err; /* Skip task abort in case previous aborts failed and report failure */ @@ -668,10 +667,5 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) return FAILED; } - spin_lock_irqsave(&hwq->cq_lock, flags); - if (ufshcd_cmd_inflight(lrbp->cmd)) - ufshcd_release_scsi_cmd(hba, lrbp); - spin_unlock_irqrestore(&hwq->cq_lock, flags); - return SUCCESS; } diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index c32950b55966..b21d96365b65 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -6545,9 +6545,14 @@ static void ufshcd_err_handler(struct work_struct *work) up(&hba->host_sem); return; } + spin_unlock_irqrestore(hba->host->host_lock, flags); + + ufshcd_err_handling_prepare(hba); + + spin_lock_irqsave(hba->host->host_lock, flags); ufshcd_set_eh_in_progress(hba); spin_unlock_irqrestore(hba->host->host_lock, flags); - ufshcd_err_handling_prepare(hba); + /* Complete requests that have door-bell cleared by h/w */ ufshcd_complete_requests(hba, false); spin_lock_irqsave(hba->host->host_lock, flags); diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 4e388c78ee5a..7fb04cf6659c 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "xhci.h" #include "xhci-trace.h" @@ -196,6 +197,7 @@ int xhci_reset(struct xhci_hcd *xhci, u64 timeout_us) u32 command; u32 state; int ret; + bool full_reset = 0; state = readl(&xhci->op_regs->status); @@ -224,8 +226,11 @@ int xhci_reset(struct xhci_hcd *xhci, u64 timeout_us) if (xhci->quirks & XHCI_INTEL_HOST) udelay(1000); + trace_android_vh_xhci_full_reset_on_remove(&full_reset); + ret = xhci_handshake_check_state(xhci, &xhci->op_regs->command, - CMD_RESET, 0, timeout_us, XHCI_STATE_REMOVING); + CMD_RESET, 0, timeout_us, + full_reset ? 
0 : XHCI_STATE_REMOVING); if (ret) return ret; diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index b65fc70e4033..480e5d3b7b7e 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -547,6 +547,14 @@ struct pd_rx_event { struct pd_message msg; }; +struct altmode_vdm_event { + struct kthread_work work; + struct tcpm_port *port; + u32 header; + u32 *data; + int cnt; +}; + static const char * const pd_rev[] = { [PD_REV10] = "rev1", [PD_REV20] = "rev2", @@ -1531,14 +1539,66 @@ static void tcpm_queue_vdm(struct tcpm_port *port, const u32 header, mod_vdm_delayed_work(port, 0); } -static void tcpm_queue_vdm_unlocked(struct tcpm_port *port, const u32 header, - const u32 *data, int cnt) +static void tcpm_queue_vdm_work(struct kthread_work *work) { + struct altmode_vdm_event *event = container_of(work, + struct altmode_vdm_event, + work); + struct tcpm_port *port = event->port; + mutex_lock(&port->lock); - tcpm_queue_vdm(port, header, data, cnt); + if (port->state != SRC_READY && port->state != SNK_READY) { + tcpm_log_force(port, "dropping altmode_vdm_event"); + goto port_unlock; + } + + tcpm_queue_vdm(port, event->header, event->data, event->cnt); + +port_unlock: + kfree(event->data); + kfree(event); mutex_unlock(&port->lock); } +static int tcpm_queue_vdm_unlocked(struct tcpm_port *port, const u32 header, + const u32 *data, int cnt) +{ + struct altmode_vdm_event *event; + u32 *data_cpy; + int ret = -ENOMEM; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + goto err_event; + + data_cpy = kcalloc(cnt, sizeof(u32), GFP_KERNEL); + if (!data_cpy) + goto err_data; + + kthread_init_work(&event->work, tcpm_queue_vdm_work); + event->port = port; + event->header = header; + memcpy(data_cpy, data, sizeof(u32) * cnt); + event->data = data_cpy; + event->cnt = cnt; + + ret = kthread_queue_work(port->wq, &event->work); + if (!ret) { + ret = -EBUSY; + goto err_queue; + } + + return 0; + +err_queue: + kfree(data_cpy); +err_data: + kfree(event); +err_event: + tcpm_log_force(port, "failed to queue altmode vdm, err:%d", ret); + return ret; +} + static void svdm_consume_identity(struct tcpm_port *port, const u32 *p, int cnt) { u32 vdo = p[VDO_INDEX_IDH]; @@ -2297,8 +2357,7 @@ static int tcpm_altmode_enter(struct typec_altmode *altmode, u32 *vdo) header = VDO(altmode->svid, vdo ? 2 : 1, svdm_version, CMD_ENTER_MODE); header |= VDO_OPOS(altmode->mode); - tcpm_queue_vdm_unlocked(port, header, vdo, vdo ? 1 : 0); - return 0; + return tcpm_queue_vdm_unlocked(port, header, vdo, vdo ? 
1 : 0); } static int tcpm_altmode_exit(struct typec_altmode *altmode) @@ -2314,8 +2373,7 @@ static int tcpm_altmode_exit(struct typec_altmode *altmode) header = VDO(altmode->svid, 1, svdm_version, CMD_EXIT_MODE); header |= VDO_OPOS(altmode->mode); - tcpm_queue_vdm_unlocked(port, header, NULL, 0); - return 0; + return tcpm_queue_vdm_unlocked(port, header, NULL, 0); } static int tcpm_altmode_vdm(struct typec_altmode *altmode, @@ -2323,9 +2381,7 @@ static int tcpm_altmode_vdm(struct typec_altmode *altmode, { struct tcpm_port *port = typec_altmode_get_drvdata(altmode); - tcpm_queue_vdm_unlocked(port, header, data, count - 1); - - return 0; + return tcpm_queue_vdm_unlocked(port, header, data, count - 1); } static const struct typec_altmode_ops tcpm_altmode_ops = { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 1d9b2a61ca63..b0f8b18ff491 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -336,6 +336,7 @@ static struct workqueue_struct *z_erofs_workqueue __read_mostly; #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD static struct kthread_worker __rcu **z_erofs_pcpu_workers; +static atomic_t erofs_percpu_workers_initialized = ATOMIC_INIT(0); static void erofs_destroy_percpu_workers(void) { @@ -381,12 +382,8 @@ static int erofs_init_percpu_workers(void) } return 0; } -#else -static inline void erofs_destroy_percpu_workers(void) {} -static inline int erofs_init_percpu_workers(void) { return 0; } -#endif -#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD) +#ifdef CONFIG_HOTPLUG_CPU static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); static enum cpuhp_state erofs_cpuhp_state; @@ -443,15 +440,53 @@ static void erofs_cpu_hotplug_destroy(void) if (erofs_cpuhp_state) cpuhp_remove_state_nocalls(erofs_cpuhp_state); } -#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */ +#else /* !CONFIG_HOTPLUG_CPU */ static inline int erofs_cpu_hotplug_init(void) { return 0; } static inline void erofs_cpu_hotplug_destroy(void) {} -#endif +#endif/* CONFIG_HOTPLUG_CPU */ +static int z_erofs_init_pcpu_workers(struct super_block *sb) +{ + int err; + + if (atomic_xchg(&erofs_percpu_workers_initialized, 1)) + return 0; + + err = erofs_init_percpu_workers(); + if (err) { + erofs_err(sb, "per-cpu workers: failed to allocate."); + goto err_init_percpu_workers; + } + + err = erofs_cpu_hotplug_init(); + if (err < 0) { + erofs_err(sb, "per-cpu workers: failed CPU hotplug init."); + goto err_cpuhp_init; + } + erofs_info(sb, "initialized per-cpu workers successfully."); + return err; + +err_cpuhp_init: + erofs_destroy_percpu_workers(); +err_init_percpu_workers: + atomic_set(&erofs_percpu_workers_initialized, 0); + return err; +} + +static void z_erofs_destroy_pcpu_workers(void) +{ + if (!atomic_xchg(&erofs_percpu_workers_initialized, 0)) + return; + erofs_cpu_hotplug_destroy(); + erofs_destroy_percpu_workers(); +} +#else /* !CONFIG_EROFS_FS_PCPU_KTHREAD */ +static inline int z_erofs_init_pcpu_workers(struct super_block *sb) { return 0; } +static inline void z_erofs_destroy_pcpu_workers(void) {} +#endif/* CONFIG_EROFS_FS_PCPU_KTHREAD */ void z_erofs_exit_zip_subsystem(void) { - erofs_cpu_hotplug_destroy(); - erofs_destroy_percpu_workers(); + z_erofs_destroy_pcpu_workers(); destroy_workqueue(z_erofs_workqueue); z_erofs_destroy_pcluster_pool(); } @@ -467,23 +502,12 @@ int __init z_erofs_init_zip_subsystem(void) WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); if (!z_erofs_workqueue) { err = -ENOMEM; - goto out_error_workqueue_init; + goto out_err_workqueue_init; } - err = erofs_init_percpu_workers(); - if 
(err) - goto out_error_pcpu_worker; - - err = erofs_cpu_hotplug_init(); - if (err < 0) - goto out_error_cpuhp_init; return err; -out_error_cpuhp_init: - erofs_destroy_percpu_workers(); -out_error_pcpu_worker: - destroy_workqueue(z_erofs_workqueue); -out_error_workqueue_init: +out_err_workqueue_init: z_erofs_destroy_pcluster_pool(); out_error_pcluster_pool: return err; @@ -711,8 +735,14 @@ static const struct address_space_operations z_erofs_cache_aops = { int erofs_init_managed_cache(struct super_block *sb) { - struct inode *const inode = new_inode(sb); + struct inode *inode; + int err; + err = z_erofs_init_pcpu_workers(sb); + if (err) + return err; + + inode = new_inode(sb); if (!inode) return -ENOMEM; diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c index 5cb9fb6086a1..21fb9cdd20aa 100644 --- a/fs/fuse/backing.c +++ b/fs/fuse/backing.c @@ -799,6 +799,10 @@ int fuse_file_read_iter_initialize( .size = to->count, }; + fri->frio = (struct fuse_read_iter_out) { + .ret = fri->fri.size, + }; + /* TODO we can't assume 'to' is a kvec */ /* TODO we also can't assume the vector has only one component */ *fa = (struct fuse_bpf_args) { @@ -833,6 +837,11 @@ int fuse_file_read_iter_backing(struct fuse_bpf_args *fa, if (!iov_iter_count(to)) return 0; + if ((iocb->ki_flags & IOCB_DIRECT) && + (!ff->backing_file->f_mapping->a_ops || + !ff->backing_file->f_mapping->a_ops->direct_IO)) + return -EINVAL; + /* TODO This just plain ignores any change to fuse_read_in */ if (is_sync_kiocb(iocb)) { ret = vfs_iter_read(ff->backing_file, to, &iocb->ki_pos, @@ -855,13 +864,14 @@ int fuse_file_read_iter_backing(struct fuse_bpf_args *fa, fuse_bpf_aio_cleanup_handler(aio_req); } + frio->ret = ret; + /* TODO Need to point value at the buffer for post-modification */ out: fuse_file_accessed(file, ff->backing_file); - frio->ret = ret; - return ret < 0 ? ret : 0; + return ret; } void *fuse_file_read_iter_finalize(struct fuse_bpf_args *fa, diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 87c57b18b899..78320acce7b0 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -41,6 +41,24 @@ struct poll_table_struct; /* define the enumeration of all cgroup subsystems */ #define SUBSYS(_x) _x ## _cgrp_id, + +#define CSS_COUNTERS_SIZE (CGROUP_SUBSYS_COUNT * sizeof(atomic_t)) + +/* + * This should just use max(), but max() doesn't work in struct definitions. + * + * Originally, the space was reserved for per cgroup subsystem counters, where each counter was + * the size of an atomic_t variable. However, it was later reused to fit a struct rcu_head + * which is why the calculation considers the size of struct rcu_head. + * + * This macro is provided to ANDROID_BACKPORT_USE_ARRAY() which needs to reserve at least + * enough memory to accommodate struct rcu_head. However, if we only reserve CSS_COUNTERS_SIZE, + * that may not be enough space on kernels with a small amount of cgroup subsystems enabled. So, + * we take the max between the two values to use in ANDROID_BACKPORT_USE_ARRAY(). + */ +#define CGROUP_ROOT_BACKPORT_PADDING_SIZE \ + (CSS_COUNTERS_SIZE > sizeof(struct rcu_head) ? CSS_COUNTERS_SIZE : sizeof(struct rcu_head)) + enum cgroup_subsys_id { #include CGROUP_SUBSYS_COUNT, @@ -585,8 +603,12 @@ struct cgroup_root { /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; - ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_SUBSYS_COUNT * sizeof(atomic_t), - struct rcu_head rcu); + /* Use the original calculation to preserve the CRC value for the ABI. 
*/ +#ifndef __GENKSYMS__ + ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_ROOT_BACKPORT_PADDING_SIZE, struct rcu_head rcu); +#else + ANDROID_BACKPORT_USE_ARRAY(1, CGROUP_SUBSYS_COUNT * sizeof(atomic_t), struct rcu_head rcu); +#endif }; /* diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 46d28cbe7171..f58c44dfc06b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -277,15 +277,25 @@ struct mthp_stat { #ifdef CONFIG_SYSFS DECLARE_PER_CPU(struct mthp_stat, mthp_stats); -static inline void count_mthp_stat(int order, enum mthp_stat_item item) +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) { if (order <= 0 || order > PMD_ORDER) return; - this_cpu_inc(mthp_stats.stats[order][item]); + this_cpu_add(mthp_stats.stats[order][item], delta); } + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ + mod_mthp_stat(order, item, 1); +} + unsigned long sum_mthp_stat(int order, enum mthp_stat_item item); #else +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) +{ +} + static inline void count_mthp_stat(int order, enum mthp_stat_item item) { } @@ -326,7 +336,7 @@ static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } -void deferred_split_folio(struct folio *folio); +void deferred_split_folio(struct folio *folio, bool partially_mapped); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio); @@ -486,7 +496,7 @@ static inline int split_huge_page(struct page *page) { return 0; } -static inline void deferred_split_folio(struct folio *folio) {} +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index f68865e19b0b..30baae91b225 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -4,6 +4,7 @@ #include /* MMF_VM_HUGEPAGE */ +extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 110e699ef494..3d400a0f2118 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -731,8 +731,15 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) __mem_cgroup_uncharge_list(page_list); } -void mem_cgroup_replace_folio(struct folio *old, struct folio *new); +void __mem_cgroup_uncharge_folios(struct folio_batch *folios); +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_folios(folios); +} +void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); /** @@ -1171,6 +1178,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); +extern int mem_cgroup_init(void); #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1297,6 +1305,10 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } +static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ +} + static inline void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { @@ -1619,6 +1631,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, { return 0; } + 
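/*
 * Reviewer sketch (illustrative, not part of the patch): the new
 * mem_cgroup_uncharge_folios() wrapper and mem_cgroup_init() stub above
 * follow the usual memcg convention -- a cheap inline wrapper returns
 * early when memcg is runtime-disabled, and a static-inline no-op is
 * provided for CONFIG_MEMCG=n so callers never need #ifdef guards.  The
 * plain-C model below only shows the shape of that pattern; the names,
 * the FEATURE_MEMCG macro and the printf body are stand-ins, not kernel
 * code.
 */
#include <stdbool.h>
#include <stdio.h>

struct folio_batch_model { int nr; };        /* stand-in for struct folio_batch */

#define FEATURE_MEMCG 1                      /* flip to 0 to build the no-op stub */

#if FEATURE_MEMCG
static bool memcg_disabled;                  /* runtime switch, cf. mem_cgroup_disabled() */

static void __uncharge_folios(struct folio_batch_model *folios)
{
	printf("uncharging %d folios\n", folios->nr);  /* stands in for the out-of-line path */
}

static inline void uncharge_folios(struct folio_batch_model *folios)
{
	if (memcg_disabled)                      /* early out keeps the common path cheap */
		return;
	__uncharge_folios(folios);
}
#else
static inline void uncharge_folios(struct folio_batch_model *folios) { }  /* compiled-out stub */
#endif

int main(void)
{
	struct folio_batch_model fb = { .nr = 3 };

	uncharge_folios(&fb);                    /* call site is identical in every configuration */
	return 0;
}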
+static inline int mem_cgroup_init(void) { return 0; } #endif /* CONFIG_MEMCG */ static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) @@ -1682,18 +1696,18 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, return folio_lruvec_lock_irq(folio); } -/* Don't lock again iff page's lruvec locked */ -static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio, - struct lruvec *locked_lruvec, unsigned long *flags) +/* Don't lock again iff folio's lruvec locked */ +static inline void folio_lruvec_relock_irqsave(struct folio *folio, + struct lruvec **lruvecp, unsigned long *flags) { - if (locked_lruvec) { - if (folio_matches_lruvec(folio, locked_lruvec)) - return locked_lruvec; + if (*lruvecp) { + if (folio_matches_lruvec(folio, *lruvecp)) + return; - unlock_page_lruvec_irqrestore(locked_lruvec, *flags); + unlock_page_lruvec_irqrestore(*lruvecp, *flags); } - return folio_lruvec_lock_irqsave(folio, flags); + *lruvecp = folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/include/linux/mm.h b/include/linux/mm.h index e6d5be7a3e92..c2422ab80c29 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -39,6 +39,7 @@ struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; +struct folio_batch; extern int sysctl_page_lock_unfairness; @@ -1539,6 +1540,8 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } +void folios_put_refs(struct folio_batch *folios, unsigned int *refs); + /* * union release_pages_arg - an array of pages or folios * @@ -1561,18 +1564,19 @@ void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. - * @nr: How many folios there are. * - * Like folio_put(), but for an array of folios. This is more efficient - * than writing the loop yourself as it will optimise the locks which - * need to be taken if the folios are freed. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need to + * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ -static inline void folios_put(struct folio **folios, unsigned int nr) +static inline void folios_put(struct folio_batch *folios) { - release_pages(folios, nr); + folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3311757c9d98..8d7e93a8437b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -37,6 +37,22 @@ #define NR_PAGE_ORDERS (MAX_ORDER + 1) +/* Defines the order for the number of pages that have a migrate type. 
*/ +#ifndef CONFIG_PAGE_BLOCK_ORDER +#define PAGE_BLOCK_ORDER MAX_ORDER +#else +#define PAGE_BLOCK_ORDER CONFIG_PAGE_BLOCK_ORDER +#endif /* CONFIG_PAGE_BLOCK_ORDER */ + +/* + * The MAX_ORDER, which defines the max order of pages to be allocated + * by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_ORDER, + * which defines the order for the number of pages that can have a migrate type + */ +#if (PAGE_BLOCK_ORDER > MAX_ORDER) +#error MAX_ORDER must be >= PAGE_BLOCK_ORDER +#endif + /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 618afa3d40b5..11cfd3bde50e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -197,6 +197,7 @@ enum pageflags { /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_error, PG_large_rmappable = PG_workingset, /* anon or file-backed */ + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -372,54 +373,77 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 +#define FOLIO_HEAD_PAGE 0 +#define FOLIO_SECOND_PAGE 1 + /* * Macros to create function definitions for page flags */ +#define FOLIO_TEST_FLAG(name, page) \ +static __always_inline bool folio_test_##name(struct folio *folio) \ +{ return test_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_SET_FLAG(name, page) \ +static __always_inline void folio_set_##name(struct folio *folio) \ +{ set_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_CLEAR_FLAG(name, page) \ +static __always_inline void folio_clear_##name(struct folio *folio) \ +{ clear_bit(PG_##name, folio_flags(folio, page)); } + +#define __FOLIO_SET_FLAG(name, page) \ +static __always_inline void __folio_set_##name(struct folio *folio) \ +{ __set_bit(PG_##name, folio_flags(folio, page)); } + +#define __FOLIO_CLEAR_FLAG(name, page) \ +static __always_inline void __folio_clear_##name(struct folio *folio) \ +{ __clear_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_TEST_SET_FLAG(name, page) \ +static __always_inline bool folio_test_set_##name(struct folio *folio) \ +{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_TEST_CLEAR_FLAG(name, page) \ +static __always_inline bool folio_test_clear_##name(struct folio *folio) \ +{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); } + +#define FOLIO_FLAG(name, page) \ +FOLIO_TEST_FLAG(name, page) \ +FOLIO_SET_FLAG(name, page) \ +FOLIO_CLEAR_FLAG(name, page) + #define TESTPAGEFLAG(uname, lname, policy) \ -static __always_inline bool folio_test_##lname(struct folio *folio) \ -{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void folio_set_##lname(struct folio *folio) \ -{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void folio_clear_##lname(struct folio *folio) \ 
-{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void __folio_set_##lname(struct folio *folio) \ -{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +__FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ -static __always_inline \ -void __folio_clear_##lname(struct folio *folio) \ -{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ -static __always_inline \ -bool folio_test_set_##lname(struct folio *folio) \ -{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ -static __always_inline \ -bool folio_test_clear_##lname(struct folio *folio) \ -{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ +FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } @@ -842,8 +866,18 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND) +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +/* + * PG_partially_mapped is protected by deferred_split split_queue_lock, + * so its safe to use non-atomic set/clear. 
+ */ +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable) +FOLIO_TEST_FLAG_FALSE(partially_mapped) +__FOLIO_SET_FLAG_NOOP(partially_mapped) +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) @@ -1111,7 +1145,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ - 1UL << PG_large_rmappable) + 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index c16db0067090..73dc2c1841ec 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -3,10 +3,6 @@ #define __LINUX_PAGEISOLATION_H #ifdef CONFIG_MEMORY_ISOLATION -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return zone->nr_isolate_pageblock; -} static inline bool is_migrate_isolate_page(struct page *page) { return get_pageblock_migratetype(page) == MIGRATE_ISOLATE; @@ -16,10 +12,6 @@ static inline bool is_migrate_isolate(int migratetype) return migratetype == MIGRATE_ISOLATE; } #else -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return false; -} static inline bool is_migrate_isolate_page(struct page *page) { return false; diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e83c4c095041..1791c2fdbbb9 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -28,7 +28,7 @@ enum pageblock_bits { NR_PAGEBLOCK_BITS }; -#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE) #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -41,14 +41,18 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. 
*/ -#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER) +#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, PAGE_BLOCK_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ -#else /* CONFIG_HUGETLB_PAGE */ +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE) + +#define pageblock_order min_t(unsigned int, HPAGE_PMD_ORDER, PAGE_BLOCK_ORDER) + +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order MAX_ORDER +#define pageblock_order PAGE_BLOCK_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 30491e14c349..ce92efef05b6 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -742,7 +742,12 @@ int folio_mkclean(struct folio *); int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); +enum rmp_flags { + RMP_LOCKED = 1 << 0, + RMP_USE_SHARED_ZEROPAGE = 1 << 1, +}; + +void remove_migration_ptes(struct folio *src, struct folio *dst, int flags); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); diff --git a/include/linux/trace.h b/include/linux/trace.h index bb4d84f1c58c..59e43df84f78 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -52,6 +52,8 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip, int trace_array_init_printk(struct trace_array *tr); void trace_array_put(struct trace_array *tr); struct trace_array *trace_array_get_by_name(const char *name); +struct trace_array *trace_array_get_by_name_ext(const char *name, + const char *systems); int trace_array_destroy(struct trace_array *tr); /* For osnoise tracer */ @@ -88,6 +90,11 @@ static inline struct trace_array *trace_array_get_by_name(const char *name) { return NULL; } +static inline struct trace_array *trace_array_get_by_name_ext( + const char *name, const char *systems) +{ + return NULL; +} static inline int trace_array_destroy(struct trace_array *tr) { return 0; diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 7a00d7ed527b..a320b08230b5 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -8,21 +8,46 @@ #include #include -void unix_inflight(struct user_struct *user, struct file *fp); -void unix_notinflight(struct user_struct *user, struct file *fp); -void unix_destruct_scm(struct sk_buff *skb); -void io_uring_destruct_scm(struct sk_buff *skb); -void unix_gc(void); -void wait_for_unix_gc(void); +#if IS_ENABLED(CONFIG_UNIX) struct unix_sock *unix_get_socket(struct file *filp); +#else +static inline struct unix_sock *unix_get_socket(struct file *filp) +{ + return NULL; +} +#endif + +extern unsigned int unix_tot_inflight; +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); +void unix_del_edges(struct scm_fp_list *fpl); +void unix_update_edges(struct unix_sock *receiver); +int unix_prepare_fpl(struct scm_fp_list *fpl); +void unix_destroy_fpl(struct scm_fp_list *fpl); +void unix_gc(void); +void wait_for_unix_gc(struct scm_fp_list *fpl); + +struct unix_vertex { + struct list_head edges; + struct list_head entry; + struct list_head scc_entry; + unsigned long out_degree; + unsigned long index; + unsigned long scc_index; +}; + +struct unix_edge { + struct unix_sock *predecessor; + struct unix_sock *successor; + struct list_head vertex_entry; + struct list_head stack_entry; +}; + struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) 
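/*
 * Reviewer sketch (illustrative, not part of the patch): the unix_vertex /
 * unix_edge declarations above describe in-flight SCM_RIGHTS transfers as a
 * directed graph -- roughly one vertex per AF_UNIX socket with fds in
 * flight and one edge per in-flight fd, linking a predecessor socket to a
 * successor socket, so cyclic garbage can be detected as strongly connected
 * components rather than with the old gc_flags candidate scan.  The toy
 * below models only the out_degree bookkeeping implied by unix_add_edges()
 * and unix_del_edges(); it is a plain-C illustration, not the kernel
 * implementation, and every toy_* name is invented.
 */
#include <stdio.h>

struct toy_vertex {
	int id;
	unsigned long out_degree;     /* edges whose predecessor is this vertex */
};

struct toy_edge {
	struct toy_vertex *predecessor;
	struct toy_vertex *successor;
};

static void toy_add_edge(struct toy_edge *e, struct toy_vertex *from, struct toy_vertex *to)
{
	e->predecessor = from;
	e->successor = to;
	from->out_degree++;           /* one more reference to "from" is now in flight */
}

static void toy_del_edge(struct toy_edge *e)
{
	e->predecessor->out_degree--; /* reference consumed or dropped */
	e->predecessor = NULL;
	e->successor = NULL;
}

int main(void)
{
	struct toy_vertex a = { .id = 1 }, b = { .id = 2 };
	struct toy_edge ab, ba;

	toy_add_edge(&ab, &a, &b);    /* a's fd travels in a message queued on b */
	toy_add_edge(&ba, &b, &a);    /* and vice versa: a 2-cycle, i.e. potential garbage */
	printf("out_degree: a=%lu b=%lu\n", a.out_degree, b.out_degree);

	toy_del_edge(&ab);
	toy_del_edge(&ba);
	printf("after release: a=%lu b=%lu\n", a.out_degree, b.out_degree);
	return 0;
}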
#define UNIX_HASH_SIZE (256 * 2) #define UNIX_HASH_BITS 8 -extern unsigned int unix_tot_inflight; - struct unix_address { refcount_t refcnt; int len; @@ -42,6 +67,7 @@ struct unix_skb_parms { struct scm_stat { atomic_t nr_fds; + unsigned long nr_unix_fds; }; #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) @@ -54,12 +80,9 @@ struct unix_sock { struct path path; struct mutex iolock, bindlock; struct sock *peer; - struct list_head link; - unsigned long inflight; + struct unix_vertex *vertex; + struct sock *listener; spinlock_t lock; - unsigned long gc_flags; -#define UNIX_GC_CANDIDATE 0 -#define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; wait_queue_entry_t peer_wake; struct scm_stat scm_stat; diff --git a/include/net/scm.h b/include/net/scm.h index e8c76b4be2fe..bdb2639f6bbf 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -22,11 +22,24 @@ struct scm_creds { kgid_t gid; }; +#ifdef CONFIG_UNIX +struct unix_edge; +#endif + struct scm_fp_list { short count; short max; struct user_struct *user; struct file *fp[SCM_MAX_FD]; +#ifndef __GENKSYMS__ +#ifdef CONFIG_UNIX + bool inflight; + bool dead; + struct list_head vertices; + struct unix_edge *edges; +#endif + short count_unix; +#endif }; struct scm_cookie { diff --git a/include/trace/hooks/mm.h b/include/trace/hooks/mm.h index 5869f672054e..00df4c5ea263 100644 --- a/include/trace/hooks/mm.h +++ b/include/trace/hooks/mm.h @@ -431,6 +431,9 @@ DECLARE_HOOK(android_vh_add_lazyfree_bypass, DECLARE_HOOK(android_vh_do_async_mmap_readahead, TP_PROTO(struct vm_fault *vmf, struct folio *folio, bool *skip), TP_ARGS(vmf, folio, skip)); +DECLARE_HOOK(android_vh_mm_free_page, + TP_PROTO(struct page *page), + TP_ARGS(page)); DECLARE_HOOK(android_vh_cma_debug_show_areas, TP_PROTO(bool *show), @@ -596,6 +599,9 @@ DECLARE_HOOK(android_vh_folio_remove_rmap_ptes, DECLARE_HOOK(android_vh_pageset_update, TP_PROTO(unsigned long *high, unsigned long *batch), TP_ARGS(high, batch)); +DECLARE_HOOK(android_vh_mempool_alloc_skip_wait, + TP_PROTO(gfp_t *gfp_flags, bool *skip_wait), + TP_ARGS(gfp_flags, skip_wait)); #endif /* _TRACE_HOOK_MM_H */ /* This part must be outside protection */ diff --git a/include/trace/hooks/usb.h b/include/trace/hooks/usb.h index 723de57f0da2..560d45213165 100644 --- a/include/trace/hooks/usb.h +++ b/include/trace/hooks/usb.h @@ -31,6 +31,10 @@ DECLARE_HOOK(android_vh_usb_new_device_added, TP_PROTO(struct usb_device *udev, int *err), TP_ARGS(udev, err)); +DECLARE_HOOK(android_vh_xhci_full_reset_on_remove, + TP_PROTO(bool *full_reset), + TP_ARGS(full_reset)); + #endif /* _TRACE_HOOK_USB_H */ /* This part must be outside protection */ #include diff --git a/init/main.c b/init/main.c index c787e94cc898..fdbe5e68c8d3 100644 --- a/init/main.c +++ b/init/main.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -1062,6 +1063,7 @@ void start_kernel(void) proc_root_init(); nsfs_init(); cpuset_init(); + mem_cgroup_init(); cgroup_init(); taskstats_init_early(); delayacct_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 0abd8434b714..75b1a4458a7e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -452,7 +452,7 @@ struct kmem_cache *files_cachep; struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ -static struct kmem_cache *vm_area_cachep; +struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4283890db87c..7ec81f9d9be5 100644 --- 
a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -227,6 +227,14 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, void irq_startup_managed(struct irq_desc *desc) { + struct irq_data *d = irq_desc_get_irq_data(desc); + + /* + * Clear managed-shutdown flag, so we don't repeat managed-startup for + * multiple hotplugs, and cause imbalanced disable depth. + */ + irqd_clr_managed_shutdown(d); + /* * Only start it up when the disable depth is 1, so that a disable, * hotunplug, hotplug sequence does not end up enabling it during diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 20067a655e20..1f90f5b6aee5 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -211,13 +211,6 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) return; - /* - * Don't restore suspended interrupts here when a system comes back - * from S3. They are reenabled via resume_device_irqs(). - */ - if (desc->istate & IRQS_SUSPENDED) - return; - if (irqd_is_managed_and_shutdown(data)) irq_startup_managed(desc); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ea46d46b3f7..23f7c8afe571 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9538,16 +9538,19 @@ static int trace_array_create_dir(struct trace_array *tr) return ret; } -static struct trace_array *trace_array_create(const char *name) +static struct trace_array * +trace_array_create_systems(const char *name, const char *systems) { + struct trace_array_ext *tr_ext; struct trace_array *tr; int ret; ret = -ENOMEM; - tr = kzalloc(sizeof(*tr), GFP_KERNEL); - if (!tr) + tr_ext = kzalloc(sizeof(*tr_ext), GFP_KERNEL); + if (!tr_ext) return ERR_PTR(ret); + tr = &tr_ext->trace_array; tr->name = kstrdup(name, GFP_KERNEL); if (!tr->name) goto out_free_tr; @@ -9558,6 +9561,12 @@ static struct trace_array *trace_array_create(const char *name) if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) goto out_free_tr; + if (systems) { + tr_ext->system_names = kstrdup_const(systems, GFP_KERNEL); + if (!tr_ext->system_names) + goto out_free_tr; + } + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9601,12 +9610,18 @@ static struct trace_array *trace_array_create(const char *name) free_trace_buffers(tr); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr_ext->system_names); kfree(tr->name); - kfree(tr); + kfree(tr_ext); return ERR_PTR(ret); } +static struct trace_array *trace_array_create(const char *name) +{ + return trace_array_create_systems(name, NULL); +} + static int instance_mkdir(const char *name) { struct trace_array *tr; @@ -9629,9 +9644,27 @@ out_unlock: return ret; } +const char *trace_array_get_system_names(struct trace_array *tr) +{ + struct trace_array_ext *tr_ext; + + if (tr == &global_trace) + return NULL; + + tr_ext = container_of(tr, struct trace_array_ext, trace_array); + return tr_ext->system_names; +} + +struct trace_array *trace_array_get_by_name(const char *name) +{ + return trace_array_get_by_name_ext(name, NULL); +} +EXPORT_SYMBOL_GPL(trace_array_get_by_name); + /** - * trace_array_get_by_name - Create/Lookup a trace array, given its name. + * trace_array_get_by_name_ext - Create/Lookup a trace array, given its name. * @name: The name of the trace array to be looked up/created. 
+ * @systems: A list of systems to create event directories for (NULL for all) * * Returns pointer to trace array with given name. * NULL, if it cannot be created. @@ -9645,7 +9678,8 @@ out_unlock: * trace_array_put() is called, user space can not delete it. * */ -struct trace_array *trace_array_get_by_name(const char *name) +struct trace_array *trace_array_get_by_name_ext(const char *name, + const char *systems) { struct trace_array *tr; @@ -9657,7 +9691,7 @@ struct trace_array *trace_array_get_by_name(const char *name) goto out_unlock; } - tr = trace_array_create(name); + tr = trace_array_create_systems(name, systems); if (IS_ERR(tr)) tr = NULL; @@ -9669,11 +9703,14 @@ out_unlock: mutex_unlock(&event_mutex); return tr; } -EXPORT_SYMBOL_GPL(trace_array_get_by_name); +EXPORT_SYMBOL_GPL(trace_array_get_by_name_ext); static int __remove_instance(struct trace_array *tr) { int i; + struct trace_array_ext *tr_ext = container_of(tr, + struct trace_array_ext, + trace_array); /* Reference counter for a newly created trace array = 1. */ if (tr->ref > 1 || (tr->current_trace && tr->trace_ref)) @@ -9704,8 +9741,9 @@ static int __remove_instance(struct trace_array *tr) free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr_ext->system_names); kfree(tr->name); - kfree(tr); + kfree(tr_ext); return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0932c1ad1eab..19a2cf11d86e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -412,6 +412,11 @@ struct trace_array { struct trace_func_repeats __percpu *last_func_repeats; }; +struct trace_array_ext { + const char *system_names; + struct trace_array trace_array; +}; + enum { TRACE_ARRAY_FL_GLOBAL = (1 << 0) }; @@ -420,6 +425,7 @@ extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; +extern const char *trace_array_get_system_names(struct trace_array *tr); extern int trace_array_get(struct trace_array *tr); extern int tracing_check_open_get_tr(struct trace_array *tr); extern struct trace_array *trace_array_find(const char *instance); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bf2dde80006b..5f0bd608bf90 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3041,6 +3041,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) up_write(&trace_event_sem); } +static bool event_in_systems(struct trace_event_call *call, + const char *systems) +{ + const char *system; + const char *p; + + if (!systems) + return true; + + system = call->class->system; + p = strstr(systems, system); + if (!p) + return false; + + if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',') + return false; + + p += strlen(system); + return !*p || isspace(*p) || *p == ','; +} + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -3050,9 +3071,12 @@ trace_create_new_event(struct trace_event_call *call, struct trace_event_file *file; unsigned int first; + if (!event_in_systems(call, trace_array_get_system_names(tr))) + return NULL; + file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) - return NULL; + return ERR_PTR(-ENOMEM); pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); @@ -3117,8 +3141,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) struct trace_event_file *file; file = trace_create_new_event(call, tr); + /* + * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed + * 
allocation, or NULL if the event is not part of the tr->system_names. + * When the event is not part of the tr->system_names, return zero, not + * an error. + */ if (!file) - return -ENOMEM; + return 0; + + if (IS_ERR(file)) + return PTR_ERR(file); if (eventdir_initialized) return event_create_dir(tr->event_dir, file); @@ -3157,8 +3190,17 @@ __trace_early_add_new_event(struct trace_event_call *call, int ret; file = trace_create_new_event(call, tr); + /* + * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed + * allocation, or NULL if the event is not part of the tr->system_names. + * When the event is not part of the tr->system_names, return zero, not + * an error. + */ if (!file) - return -ENOMEM; + return 0; + + if (IS_ERR(file)) + return PTR_ERR(file); ret = event_define_fields(call); if (ret) diff --git a/mm/Kconfig b/mm/Kconfig index 1d88bab99ff1..2b86f4fd9abd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -994,6 +994,40 @@ config CMA_AREAS If unsure, leave the default value "7" in UMA and "19" in NUMA. +# +# Select this config option from the architecture Kconfig, if available, to set +# the max page order for physically contiguous allocations. +# +config ARCH_FORCE_MAX_ORDER + int + +# +# When ARCH_FORCE_MAX_ORDER is not defined, +# the default page block order is MAX_PAGE_ORDER (10) as per +# include/linux/mmzone.h. +# +config PAGE_BLOCK_ORDER + int "Page Block Order" + range 1 10 if ARCH_FORCE_MAX_ORDER = 0 || ARCH_FORCE_MAX_ORDER = "" + default 10 if ARCH_FORCE_MAX_ORDER = 0 || ARCH_FORCE_MAX_ORDER = "" + range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0 + default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0 + help + The page block order refers to the power-of-two number of pages that + are physically contiguous and can have a migrate type associated with + them. The maximum size of the page block order is limited by + ARCH_FORCE_MAX_ORDER. + + This config allows overriding the default page block order when the + page block order is required to be smaller than ARCH_FORCE_MAX_ORDER + or MAX_ORDER. + + Reducing pageblock order can negatively impact THP generation + success rate. If your workload uses THP heavily, please use this + option with caution. + + Don't change if unsure.
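A minimal illustration, not part of the patch itself: a product kernel that wants smaller pageblocks could set the new option from a defconfig fragment, for example capping pageblocks at order 5 (128 KiB with 4 KiB base pages). The value below is a hypothetical example, not a recommendation.

CONFIG_PAGE_BLOCK_ORDER=5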
+ config MEM_SOFT_DIRTY bool "Track memory changes" depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5787f5e7b1ed..6ac6febf6fad 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -70,6 +70,7 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<address, vmf->pmd); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); + deferred_split_folio(folio, false); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); @@ -2953,7 +2977,7 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio); } -static void remap_page(struct folio *folio, unsigned long nr) +static void remap_page(struct folio *folio, unsigned long nr, int flags) { int i = 0; @@ -2961,7 +2985,7 @@ static void remap_page(struct folio *folio, unsigned long nr) if (!folio_test_anon(folio)) return; for (;;) { - remove_migration_ptes(folio, folio, true); + remove_migration_ptes(folio, folio, RMP_LOCKED | flags); i += folio_nr_pages(folio); if (i >= nr) break; @@ -3314,7 +3338,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (nr_dropped) shmem_uncharge(head->mapping->host, nr_dropped); - remap_page(folio, nr); + remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0); for (i = 0; i < nr; i++) { struct page *subpage = folio_dst_page(folio, i); @@ -3376,8 +3400,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); - struct anon_vma *anon_vma = NULL; + bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; + struct anon_vma *anon_vma = NULL; int extra_pins, ret; pgoff_t end; bool is_hzp; @@ -3394,7 +3419,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (folio_test_writeback(folio)) return -EBUSY; - if (folio_test_anon(folio)) { + if (is_anon) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a @@ -3495,6 +3520,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (folio_order(folio) > 1 && !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; + if (folio_test_partially_mapped(folio)) + __folio_clear_partially_mapped(folio); + /* + * Reinitialize page_deferred_list after removing the + * page from the split_queue, otherwise a subsequent + * split will see list corruption when checking the + * page_deferred_list. 
+ */ list_del_init(&folio->_deferred_list); } spin_unlock(&ds_queue->split_queue_lock); @@ -3522,7 +3555,7 @@ unfreeze: folio_ref_unfreeze(folio, 1 + extra_pins); remap: free_dst_pages(folio); - remap_page(folio, folio_nr_pages(folio)); + remap_page(folio, folio_nr_pages(folio), 0); } out_unlock: @@ -3572,6 +3605,8 @@ bool __folio_unqueue_deferred_split(struct folio *folio) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; + if (folio_test_partially_mapped(folio)) + __folio_clear_partially_mapped(folio); list_del_init(&folio->_deferred_list); unqueued = true; } @@ -3580,7 +3615,8 @@ bool __folio_unqueue_deferred_split(struct folio *folio) return unqueued; /* useful for debug warnings */ } -void deferred_split_folio(struct folio *folio) +/* partially_mapped=false won't clear PG_partially_mapped folio flag */ +void deferred_split_folio(struct folio *folio, bool partially_mapped) { struct deferred_split *ds_queue = get_deferred_split_queue(folio); #ifdef CONFIG_MEMCG @@ -3595,6 +3631,9 @@ void deferred_split_folio(struct folio *folio) if (folio_order(folio) <= 1) return; + if (!partially_mapped && !split_underused_thp) + return; + /* * Exclude swapcache: originally to avoid a corrupt deferred split * queue. Nowadays that is fully prevented by mem_cgroup_swapout(); @@ -3605,13 +3644,20 @@ void deferred_split_folio(struct folio *folio) if (folio_test_swapcache(folio)) return; - if (!list_empty(&folio->_deferred_list)) - return; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + if (partially_mapped) { + if (!folio_test_partially_mapped(folio)) { + __folio_set_partially_mapped(folio); + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_DEFERRED_SPLIT_PAGE); + count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); + + } + } else { + /* partially mapped folios cannot become non-partially mapped */ + VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); + } if (list_empty(&folio->_deferred_list)) { - count_vm_event(THP_DEFERRED_SPLIT_PAGE); - count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG @@ -3640,6 +3686,39 @@ static unsigned long deferred_split_count(struct shrinker *shrink, return READ_ONCE(ds_queue->split_queue_len); } +static bool thp_underused(struct folio *folio) +{ + int num_zero_pages = 0, num_filled_pages = 0; + void *kaddr; + int i; + + if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1) + return false; + + for (i = 0; i < folio_nr_pages(folio); i++) { + kaddr = kmap_local_folio(folio, i * PAGE_SIZE); + if (!memchr_inv(kaddr, 0, PAGE_SIZE)) { + num_zero_pages++; + if (num_zero_pages > khugepaged_max_ptes_none) { + kunmap_local(kaddr); + return true; + } + } else { + /* + * Another path for early exit once the number + * of non-zero filled pages exceeds threshold. 
+ */ + num_filled_pages++; + if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) { + kunmap_local(kaddr); + return false; + } + } + kunmap_local(kaddr); + } + return false; +} + static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { @@ -3647,8 +3726,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; LIST_HEAD(list); - struct folio *folio, *next; - int split = 0; + struct folio *folio, *next, *prev = NULL; + int split = 0, removed = 0; #ifdef CONFIG_MEMCG if (sc->memcg) @@ -3663,6 +3742,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, list_move(&folio->_deferred_list, &list); } else { /* We lost race with folio_put() */ + if (folio_test_partially_mapped(folio)) + __folio_clear_partially_mapped(folio); list_del_init(&folio->_deferred_list); ds_queue->split_queue_len--; } @@ -3672,20 +3753,55 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); list_for_each_entry_safe(folio, next, &list, _deferred_list) { + bool did_split = false; + bool underused = false; + + if (!folio_test_partially_mapped(folio)) { + underused = thp_underused(folio); + if (!underused) + goto next; + } if (!folio_trylock(folio)) goto next; - /* split_huge_page() removes page from list on success */ - if (!split_folio(folio)) + if (!split_folio(folio)) { + did_split = true; split++; + } folio_unlock(folio); next: - folio_put(folio); + /* + * split_folio() removes folio from list on success. + * Only add back to the queue if folio is partially mapped. + * If thp_underused returns false, or if split_folio fails + * in the case it was underused, then consider it used and + * don't add it back to split_queue. + */ + if (did_split) { + ; /* folio already removed from list */ + } else if (!folio_test_partially_mapped(folio)) { + list_del_init(&folio->_deferred_list); + removed++; + } else { + /* + * That unlocked list_del_init() above would be unsafe, + * unless its folio is separated from any earlier folios + * left on the list (which may be concurrently unqueued) + * by one safe folio with refcount still raised. + */ + swap(folio, prev); + } + if (folio) + folio_put(folio); } spin_lock_irqsave(&ds_queue->split_queue_lock, flags); list_splice_tail(&list, &ds_queue->split_queue); + ds_queue->split_queue_len -= removed; spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + if (prev) + folio_put(prev); + /* * Stop shrinker if we didn't split any page, but the queue is empty. * This can happen if pages were freed under us. 
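As a stand-alone sketch of the threshold policy used by the shrinker path above, assuming 4 KiB base pages (so HPAGE_PMD_NR == 512) and leaving out the per-subpage memchr_inv() scan, the underused decision reduces to:

/*
 * Illustration only, not kernel code. Mirrors the thp_underused()
 * policy: a PMD-sized folio counts as underused once more than
 * max_ptes_none of its subpages are zero-filled; at the default
 * max_ptes_none of HPAGE_PMD_NR - 1 (511) the check is disabled and
 * the shrinker never splits on this basis.
 */
#include <stdbool.h>

#define HPAGE_PMD_NR 512 /* assumes 4 KiB base pages, PMD order 9 */

bool thp_is_underused(unsigned int nr_zero_subpages,
		      unsigned int max_ptes_none)
{
	if (max_ptes_none == HPAGE_PMD_NR - 1)
		return false;
	return nr_zero_subpages > max_ptes_none;
}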
diff --git a/mm/internal.h b/mm/internal.h index 914e6968c8dc..da8bd4bfbb3e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -470,7 +470,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); #define K(x) ((x) << (PAGE_SHIFT-10)) extern char * const zone_names[MAX_NR_ZONES]; -extern unsigned long free_highatomics[MAX_NR_ZONES]; +extern unsigned long nr_free_highatomic[MAX_NR_ZONES]; /* perform sanity checks on struct pages being allocated or freed */ DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); @@ -721,8 +721,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; -extern void free_unref_page(struct page *page, unsigned int order); -extern void free_unref_page_list(struct list_head *list); +void free_unref_page(struct page *page, unsigned int order); +void free_unref_folios(struct folio_batch *fbatch); +void free_unref_page_list(struct list_head *list); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4e88e83cdf23..c717d3f79961 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -84,7 +84,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * * Note that these are only respected if collapse was initiated by khugepaged. */ -static unsigned int khugepaged_max_ptes_none __read_mostly; +unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; @@ -1218,6 +1218,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); + deferred_split_folio(folio, false); spin_unlock(pmd_ptl); hpage = NULL; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3b7f09e4f87e..ba4dc0146c54 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -95,6 +96,9 @@ static bool cgroup_memory_nokmem __ro_after_init; /* BPF memory accounting disabled? 
*/ static bool cgroup_memory_nobpf __ro_after_init; +static struct kmem_cache *memcg_cachep; +static struct kmem_cache *memcg_pn_cachep; + #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -5384,7 +5388,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; - pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); + pn = kmem_cache_alloc_node(memcg_pn_cachep, GFP_KERNEL | __GFP_ZERO, + node); if (!pn) return 1; @@ -5440,7 +5445,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) int __maybe_unused i; long error = -ENOMEM; - memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); + memcg = kmem_cache_zalloc(memcg_cachep, GFP_KERNEL); if (!memcg) return ERR_PTR(error); @@ -6017,8 +6022,6 @@ int mem_cgroup_move_account(struct folio *folio, css_get(&to->css); css_put(&from->css); - /* Warning should never happen, so don't worry about refcount non-0 */ - WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); folio->memcg_data = (unsigned long)to; __folio_memcg_unlock(from); @@ -6389,9 +6392,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, enum mc_target_type target_type; union mc_target target; struct folio *folio; - bool tried_split_before = false; -retry_pmd: ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (mc.precharge < HPAGE_PMD_NR) { @@ -6401,27 +6402,6 @@ retry_pmd: target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { folio = target.folio; - /* - * Deferred split queue locking depends on memcg, - * and unqueue is unsafe unless folio refcount is 0: - * split or skip if on the queue? first try to split. - */ - if (!list_empty(&folio->_deferred_list)) { - spin_unlock(ptl); - if (!tried_split_before) - split_folio(folio); - folio_unlock(folio); - folio_put(folio); - if (tried_split_before) - return 0; - tried_split_before = true; - goto retry_pmd; - } - /* - * So long as that pmd lock is held, the folio cannot - * be racily added to the _deferred_list, because - * __folio_remove_rmap() will find !partially_mapped. - */ if (folio_isolate_lru(folio)) { if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { @@ -7418,6 +7398,18 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list) uncharge_batch(&ug); } +void __mem_cgroup_uncharge_folios(struct folio_batch *folios) +{ + struct uncharge_gather ug; + unsigned int i; + + uncharge_gather_clear(&ug); + for (i = 0; i < folios->nr; i++) + uncharge_folio(folios->folios[i], &ug); + if (ug.memcg) + uncharge_batch(&ug); +} + /** * mem_cgroup_replace_folio - Charge a folio's replacement. * @old: Currently circulating folio. @@ -7606,15 +7598,16 @@ static int __init cgroup_memory(char *s) __setup("cgroup.memory=", cgroup_memory); /* - * subsys_initcall() for memory controller. + * Memory controller init, called before cgroup_init() initializes root_mem_cgroup. * * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this * context because of lock dependencies (cgroup_lock -> cpu hotplug) but * basically everything that doesn't depend on a specific mem_cgroup structure * should be initialized from here.
*/ -static int __init mem_cgroup_init(void) +int __init mem_cgroup_init(void) { + unsigned int memcg_size; int cpu, node; /* @@ -7632,6 +7625,13 @@ static int __init mem_cgroup_init(void) INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, drain_local_stock); + memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids); + memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0, + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); + + memcg_pn_cachep = KMEM_CACHE(mem_cgroup_per_node, + SLAB_PANIC | SLAB_HWCACHE_ALIGN); + for_each_node(node) { struct mem_cgroup_tree_per_node *rtpn; @@ -7645,7 +7645,6 @@ static int __init mem_cgroup_init(void) return 0; } -subsys_initcall(mem_cgroup_init); #ifdef CONFIG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) diff --git a/mm/mempool.c b/mm/mempool.c index 734bcf5afbb7..45765275ca5b 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -19,6 +19,8 @@ #include #include #include "slab.h" +#undef CREATE_TRACE_POINTS +#include #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) static void poison_error(mempool_t *pool, void *element, size_t size, @@ -383,6 +385,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) unsigned long flags; wait_queue_entry_t wait; gfp_t gfp_temp; + bool skip_wait = false; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_alloc(gfp_mask); @@ -428,6 +431,11 @@ repeat_alloc: spin_unlock_irqrestore(&pool->lock, flags); return NULL; } + trace_android_vh_mempool_alloc_skip_wait(&gfp_temp, &skip_wait); + if (skip_wait) { + spin_unlock_irqrestore(&pool->lock, flags); + goto repeat_alloc; + } /* Let's wait for someone else to return an element to @pool */ init_wait(&wait); diff --git a/mm/migrate.c b/mm/migrate.c index ad5ac57404be..6e33057a8c32 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -182,13 +182,57 @@ void putback_movable_pages(struct list_head *l) } EXPORT_SYMBOL_GPL(putback_movable_pages); +static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, + struct folio *folio, + unsigned long idx) +{ + struct page *page = folio_page(folio, idx); + bool contains_data; + pte_t newpte; + void *addr; + + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); + + if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) || + mm_forbids_zeropage(pvmw->vma->vm_mm)) + return false; + + /* + * The pmd entry mapping the old thp was flushed and the pte mapping + * this subpage has been non present. If the subpage is only zero-filled + * then map it to the shared zeropage. 
+ */ + addr = kmap_local_page(page); + contains_data = memchr_inv(addr, 0, PAGE_SIZE); + kunmap_local(addr); + + if (contains_data) + return false; + + newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address), + pvmw->vma->vm_page_prot)); + set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte); + + dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio)); + return true; +} + +struct rmap_walk_arg { + struct folio *folio; + bool map_unused_to_zeropage; +}; + /* * Restore a potential migration pte to a working pte entry */ static bool remove_migration_pte(struct folio *dst, struct vm_area_struct *vma, unsigned long addr, void *arg) { - struct folio *src = arg; + struct rmap_walk_arg *rmap_walk_arg = arg; + struct folio *src = rmap_walk_arg->folio; DEFINE_FOLIO_VMA_WALK(pvmw, src, vma, addr, PVMW_SYNC | PVMW_MIGRATION); while (page_vma_mapped_walk(&pvmw)) { @@ -228,6 +272,9 @@ static bool remove_migration_pte(struct folio *dst, continue; } #endif + if (rmap_walk_arg->map_unused_to_zeropage && + try_to_map_unused_to_zeropage(&pvmw, folio, idx)) + continue; folio_get(folio); pte = mk_pte(page, READ_ONCE(vma->vm_page_prot)); @@ -303,14 +350,21 @@ static bool remove_migration_pte(struct folio *dst, * Get rid of all migration entries and replace them by * references to the indicated page. */ -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) +void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) { - struct rmap_walk_control rwc = { - .rmap_one = remove_migration_pte, - .arg = src, + struct rmap_walk_arg rmap_walk_arg = { + .folio = src, + .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE, }; - if (locked) + struct rmap_walk_control rwc = { + .rmap_one = remove_migration_pte, + .arg = &rmap_walk_arg, + }; + + VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src); + + if (flags & RMP_LOCKED) rmap_walk_locked(dst, &rwc); else rmap_walk(dst, &rwc); @@ -461,7 +515,8 @@ int folio_migrate_mapping(struct address_space *mapping, } /* Take off deferred split queue while frozen and memcg set */ - folio_unqueue_deferred_split(folio); + if (folio_test_large(folio) && folio_test_large_rmappable(folio)) + folio_unqueue_deferred_split(folio); /* * Now we know that no one else is looking at the folio: @@ -933,7 +988,7 @@ static int writeout(struct address_space *mapping, struct folio *folio) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(folio, folio, false); + remove_migration_ptes(folio, folio, 0); rc = mapping->a_ops->writepage(&folio->page, &wbc); @@ -1096,7 +1151,7 @@ static void migrate_folio_undo_src(struct folio *src, struct list_head *ret) { if (page_was_mapped) - remove_migration_ptes(src, src, false); + remove_migration_ptes(src, src, 0); /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); @@ -1335,7 +1390,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, lru_add_drain(); if (old_page_state & PAGE_WAS_MAPPED) - remove_migration_ptes(src, dst, false); + remove_migration_ptes(src, dst, 0); out_unlock_both: folio_unlock(dst); @@ -1474,7 +1529,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (page_was_mapped) remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, false); + rc == MIGRATEPAGE_SUCCESS ? 
dst : src, 0); unlock_put_anon: folio_unlock(dst); @@ -1702,6 +1757,35 @@ static int migrate_pages_batch(struct list_head *from, cond_resched(); + /* + * The rare folio on the deferred split list should + * be split now. It should not count as a failure: + * but increment nr_failed because, without doing so, + * migrate_pages() may report success with (split but + * unmigrated) pages still on its fromlist; whereas it + * always reports success when its fromlist is empty. + * + * Only check it without removing it from the list. + * Since the folio can be on deferred_split_scan() + * local list and removing it can cause the local list + * corruption. Folio split process below can handle it + * with the help of folio_ref_freeze(). + * + * nr_pages > 2 is needed to avoid checking order-1 + * page cache folios. They exist, in contrast to + * non-existent order-1 anonymous folios, and do not + * use _deferred_list. + */ + if (nr_pages > 2 && + !list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio)) { + if (!try_split_folio(folio, split_folios, mode)) { + nr_failed++; + stats->nr_thp_split += is_thp; + continue; + } + } + /* * Large folio migration might be unsupported or * the allocation might be failed so we should retry diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 1bebdfae2286..38815d3dd72d 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -422,7 +422,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, continue; folio = page_folio(page); - remove_migration_ptes(folio, folio, false); + remove_migration_ptes(folio, folio, 0); src_pfns[i] = 0; folio_unlock(folio); @@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long *src_pfns, src = page_folio(page); dst = page_folio(newpage); - remove_migration_ptes(src, dst, false); + remove_migration_ptes(src, dst, 0); folio_unlock(src); if (is_zone_device_page(page)) diff --git a/mm/mlock.c b/mm/mlock.c index 51234dc99e74..36898c50f537 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -208,8 +208,7 @@ static void mlock_folio_batch(struct folio_batch *fbatch) if (lruvec) unlock_page_lruvec_irq(lruvec); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } void mlock_drain_local(void) diff --git a/mm/mm_init.c b/mm/mm_init.c index dad55dbde265..57cbca52fb56 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1558,7 +1558,7 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order = MAX_ORDER; + unsigned int order = PAGE_BLOCK_ORDER; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5badced39c1a..b42afcd0d3c3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -323,7 +324,7 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -unsigned long free_highatomics[MAX_NR_ZONES] = {0}; +unsigned long nr_free_highatomic[MAX_NR_ZONES] = {0}; int min_free_kbytes = 1024; int user_min_free_kbytes = -1; @@ -770,8 +771,8 @@ static inline void account_freepages(struct zone *zone, int nr_pages, if (is_migrate_cma(migratetype)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); else if (is_migrate_highatomic(migratetype)) - WRITE_ONCE(free_highatomics[zone_idx(zone)], - free_highatomics[zone_idx(zone)] + nr_pages); + 
WRITE_ONCE(nr_free_highatomic[zone_idx(zone)], + nr_free_highatomic[zone_idx(zone)] + nr_pages); } /* Used for pages not on another list */ @@ -921,7 +922,6 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(page->flags & check_flags, page); VM_BUG_ON(migratetype == -1); - VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); @@ -1237,6 +1237,7 @@ static __always_inline bool free_pages_prepare(struct page *page, } } (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + trace_android_vh_mm_free_page(page + i); } } if (PageMappingFlags(page)) @@ -1252,6 +1253,7 @@ static __always_inline bool free_pages_prepare(struct page *page, page_cpupid_reset_last(page); page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + trace_android_vh_mm_free_page(page); reset_page_owner(page, order); free_page_pinner(page, order); page_table_check_free(page, order); @@ -1372,7 +1374,6 @@ static void free_one_page(struct zone *zone, struct page *page, static void __free_pages_ok(struct page *page, unsigned int order, fpi_t fpi_flags) { - unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); struct zone *zone = page_zone(page); @@ -1392,21 +1393,17 @@ skip_prepare: fpi_flags, &skip_free_pages_ok); if (skip_free_pages_ok) return; - - spin_lock_irqsave(&zone->lock, flags); + /* + * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here + * is used to avoid calling get_pfnblock_migratetype() under the lock. + * This will reduce the lock holding time. + */ migratetype = get_pfnblock_migratetype(page, pfn); trace_android_vh_free_unref_page_bypass(page, order, migratetype, &skip_free_unref_page); - if (skip_free_unref_page) { - spin_unlock_irqrestore(&zone->lock, flags); + if (skip_free_unref_page) return; - } - if (unlikely(has_isolate_pageblock(zone) || - is_migrate_isolate(migratetype))) { - migratetype = get_pfnblock_migratetype(page, pfn); - } - __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); - spin_unlock_irqrestore(&zone->lock, flags); + free_one_page(zone, page, pfn, order, fpi_flags); __count_vm_events(PGFREE, 1 << order); } @@ -2249,8 +2246,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, struct zone *zone; struct page *page; int order; - int ret; bool skip_unreserve_highatomic = false; + int ret; for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { @@ -2765,7 +2762,7 @@ void free_unref_page(struct page *page, unsigned int order) return; if (unlikely(migratetype > MIGRATE_RECLAIMABLE)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + free_one_page(page_zone(page), page, pfn, order, FPI_NONE); return; } #ifdef CONFIG_CMA @@ -2781,64 +2778,65 @@ void free_unref_page(struct page *page, unsigned int order) free_unref_page_commit(zone, pcp, page, migratetype, order); pcp_spin_unlock(pcp); } else { - free_one_page(zone, page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, FPI_NONE); } pcp_trylock_finish(UP_flags); } /* - * Free a list of 0-order pages + * Free a batch of folios */ -void free_unref_page_list(struct list_head *list) +void free_unref_folios(struct folio_batch *folios) { unsigned long __maybe_unused UP_flags; - struct page *page, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - int batch_count = 0; - int migratetype; - bool skip_free = false; + int i, j; - /* Prepare pages for freeing */ - list_for_each_entry_safe(page, next, list, lru) 
{ - unsigned long pfn = page_to_pfn(page); - if (!free_pages_prepare(page, 0, FPI_NONE)) { - list_del(&page->lru); + /* Prepare folios for freeing */ + for (i = 0, j = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + unsigned long pfn = folio_pfn(folio); + unsigned int order = folio_order(folio); + + if (order > 0 && folio_test_large_rmappable(folio)) + folio_unqueue_deferred_split(folio); + if (!free_pages_prepare(&folio->page, order, FPI_NONE)) continue; - } - /* - * Free isolated pages directly to the allocator, see - * comment in free_unref_page. + * Free orders not handled on the PCP directly to the + * allocator. */ - migratetype = get_pfnblock_migratetype(page, pfn); - if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&page->lru); - free_one_page(page_zone(page), page, pfn, 0, FPI_NONE); + if (!pcp_allowed_order(order)) { + free_one_page(folio_zone(folio), &folio->page, + pfn, order, FPI_NONE); continue; } + folio->private = (void *)(unsigned long)order; + if (j != i) + folios->folios[j] = folio; + j++; } + folios->nr = j; - trace_android_vh_free_unref_page_list_bypass(list, &skip_free); - if (skip_free) - return; + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + struct zone *zone = folio_zone(folio); + unsigned long pfn = folio_pfn(folio); + unsigned int order = (unsigned long)folio->private; + int migratetype; - list_for_each_entry_safe(page, next, list, lru) { - struct zone *zone = page_zone(page); - unsigned long pfn = page_to_pfn(page); + folio->private = NULL; + migratetype = get_pfnblock_migratetype(&folio->page, pfn); - list_del(&page->lru); - migratetype = get_pfnblock_migratetype(page, pfn); - - /* - * Either different zone requiring a different pcp lock or - * excessive lock hold times when freeing a large list of - * pages. - */ - if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { + /* Different zone requires a different pcp lock */ + if (zone != locked_zone || + is_migrate_isolate(migratetype)) { if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); + locked_zone = NULL; + pcp = NULL; } /* @@ -2846,24 +2844,21 @@ void free_unref_page_list(struct list_head *list) * allocator, see comment in free_unref_page. */ if (is_migrate_isolate(migratetype)) { - free_one_page(zone, page, page_to_pfn(page), - 0, FPI_NONE); + free_one_page(zone, &folio->page, pfn, + order, FPI_NONE); continue; - } - - batch_count = 0; + } /* - * trylock is necessary as pages may be getting freed + * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. 
*/ pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); - free_one_page(zone, page, pfn, - 0, FPI_NONE); - locked_zone = NULL; + free_one_page(zone, &folio->page, pfn, + order, FPI_NONE); continue; } locked_zone = zone; @@ -2880,15 +2875,39 @@ void free_unref_page_list(struct list_head *list) migratetype = MIGRATE_MOVABLE; } - trace_mm_page_free_batched(page); - free_unref_page_commit(zone, pcp, page, migratetype, 0); - batch_count++; + trace_mm_page_free_batched(&folio->page); + free_unref_page_commit(zone, pcp, &folio->page, migratetype, + order); } if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } + folio_batch_reinit(folios); +} + +void free_unref_page_list(struct list_head *list) +{ + struct folio_batch fbatch; + bool skip_free = false; + + trace_android_vh_free_unref_page_list_bypass(list, &skip_free); + if (skip_free) + return; + + folio_batch_init(&fbatch); + while (!list_empty(list)) { + struct folio *folio = list_first_entry(list, struct folio, lru); + + list_del(&folio->lru); + if (folio_batch_add(&fbatch, folio) > 0) + continue; + free_unref_folios(&fbatch); + } + + if (fbatch.nr) + free_unref_folios(&fbatch); } /* @@ -3216,7 +3235,7 @@ static inline long __zone_watermark_unusable_free(struct zone *z, * watermark then subtract the free pages reserved for highatomic. */ if (likely(!(alloc_flags & ALLOC_RESERVES))) - unusable_free += READ_ONCE(free_highatomics[zone_idx(z)]); + unusable_free += READ_ONCE(nr_free_highatomic[zone_idx(z)]); #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 28c3f5aa0411..e8fce2cd72fc 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -417,9 +417,9 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, ret = __alloc_contig_migrate_range(&cc, head_pfn, head_pfn + nr_pages, page_mt); + if (ret) goto failed; - pfn = head_pfn + nr_pages; continue; } diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c index 3b020924a7a9..f72240b0de79 100644 --- a/mm/pgsize_migration.c +++ b/mm/pgsize_migration.c @@ -270,6 +270,9 @@ static const struct vm_operations_struct pad_vma_ops = { .name = pad_vma_name, }; +/* Defined in kernel/fork.c */ +extern struct kmem_cache *vm_area_cachep; + /* * Returns a new VMA representing the padding in @vma; * returns NULL if no padding in @vma or allocation failed. @@ -281,7 +284,7 @@ static struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma) if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK)) return NULL; - pad = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + pad = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!pad) { pr_warn("Page size migration: Failed to allocate padding VMA"); return NULL; @@ -347,7 +350,7 @@ void show_map_pad_vma(struct vm_area_struct *vma, struct seq_file *m, else ((show_pad_maps_fn)func)(m, pad); - kfree(pad); + kmem_cache_free(vm_area_cachep, pad); } /* diff --git a/mm/rmap.c b/mm/rmap.c index 131ebf96fdd0..12a6f9d9af85 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1599,8 +1599,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, * Check partially_mapped first to ensure it is a large folio. 
*/ if (folio_test_anon(folio) && partially_mapped && - list_empty(&folio->_deferred_list)) - deferred_split_folio(folio); + !folio_test_partially_mapped(folio)) + deferred_split_folio(folio, true); } /* diff --git a/mm/show_mem.c b/mm/show_mem.c index ecc8bc2c030c..1bda780cb5df 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -342,7 +342,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), K(zone->nr_reserved_highatomic), - K(free_highatomics[zone_idx(zone)]), + K(nr_free_highatomic[zone_idx(zone)]), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), diff --git a/mm/swap.c b/mm/swap.c index ca42ec485ba0..174259a9a5f7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -77,26 +77,33 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; +static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, + unsigned long *flagsp) +{ + if (folio_test_lru(folio)) { + folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); + lruvec_del_folio(*lruvecp, folio); + __folio_clear_lru_flags(folio); + } +} + /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages. */ -static void __page_cache_release(struct folio *folio) +static void page_cache_release(struct folio *folio) { - if (folio_test_lru(folio)) { - struct lruvec *lruvec; - unsigned long flags; + struct lruvec *lruvec = NULL; + unsigned long flags; - lruvec = folio_lruvec_lock_irqsave(folio, &flags); - lruvec_del_folio(lruvec, folio); - __folio_clear_lru_flags(folio); + __page_cache_release(folio, &lruvec, &flags); + if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); - } } static void __folio_put_small(struct folio *folio) { - __page_cache_release(folio); + page_cache_release(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, 0); } @@ -110,7 +117,7 @@ static void __folio_put_large(struct folio *folio) * be called for hugetlb (it has a separate hugetlb_cgroup.) */ if (!folio_test_hugetlb(folio)) - __page_cache_release(folio); + page_cache_release(folio); destroy_large_folio(folio); } @@ -133,22 +140,25 @@ EXPORT_SYMBOL(__folio_put); */ void put_pages_list(struct list_head *pages) { + struct folio_batch fbatch; struct folio *folio, *next; + folio_batch_init(&fbatch); list_for_each_entry_safe(folio, next, pages, lru) { - if (!folio_put_testzero(folio)) { - list_del(&folio->lru); + if (!folio_put_testzero(folio)) continue; - } if (folio_test_large(folio)) { - list_del(&folio->lru); __folio_put_large(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ + if (folio_batch_add(&fbatch, folio) > 0) + continue; + free_unref_folios(&fbatch); } - free_unref_page_list(pages); + if (fbatch.nr) + free_unref_folios(&fbatch); INIT_LIST_HEAD(pages); } EXPORT_SYMBOL(put_pages_list); @@ -170,7 +180,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily - * true of release_pages(): but those only clear the mlocked flag after + * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) 
*/ if (folio_evictable(folio)) { @@ -208,7 +218,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) continue; - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); @@ -216,8 +226,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); - folios_put(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, @@ -958,47 +967,29 @@ void lru_cache_disable(void) EXPORT_SYMBOL_GPL(lru_cache_disable); /** - * release_pages - batched put_page() - * @arg: array of pages to release - * @nr: number of pages + * folios_put_refs - Reduce the reference count on a batch of folios. + * @folios: The folios. + * @refs: The number of refs to subtract from each folio. * - * Decrement the reference count on all the pages in @arg. If it - * fell to zero, remove the page from the LRU and free it. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need + * to reinitialise it. If @refs is NULL, we subtract one from each + * folio refcount. * - * Note that the argument can be an array of pages, encoded pages, - * or folio pointers. We ignore any encoded bits, and turn any of - * them into just a folio that gets free'd. + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. */ -void release_pages(release_pages_arg arg, int nr) +void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { - int i; - struct encoded_page **encoded = arg.encoded_pages; - LIST_HEAD(pages_to_free); + int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; - unsigned int lock_batch; - for (i = 0; i < nr; i++) { - unsigned int nr_refs = 1; - struct folio *folio; - - /* Turn any of the argument types into a folio */ - folio = page_folio(encoded_page_ptr(encoded[i])); - - /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ - if (unlikely(encoded_page_flags(encoded[i]) & - ENCODED_PAGE_BIT_NR_PAGES_NEXT)) - nr_refs = encoded_nr_pages(encoded[++i]); - - /* - * Make sure the IRQ-safe lock-holding time does not get - * excessive with a continuous string of pages from the - * same lruvec. The lock is held only if lruvec != NULL. - */ - if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { - unlock_page_lruvec_irqrestore(lruvec, flags); - lruvec = NULL; - } + for (i = 0, j = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; + unsigned int nr_refs = refs ? 
refs[i] : 1; if (is_huge_zero_page(&folio->page)) continue; @@ -1018,34 +1009,73 @@ void release_pages(release_pages_arg arg, int nr) if (!folio_ref_sub_and_test(folio, nr_refs)) continue; - if (folio_test_large(folio)) { + /* hugetlb has its own memcg */ + if (folio_test_hugetlb(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - __folio_put_large(folio); + free_huge_folio(folio); continue; } - if (folio_test_lru(folio)) { - struct lruvec *prev_lruvec = lruvec; + folio_unqueue_deferred_split(folio); + __page_cache_release(folio, &lruvec, &flags); - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, - &flags); - if (prev_lruvec != lruvec) - lock_batch = 0; - - lruvec_del_folio(lruvec, folio); - __folio_clear_lru_flags(folio); - } - - list_add(&folio->lru, &pages_to_free); + if (j != i) + folios->folios[j] = folio; + j++; } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); + if (!j) { + folio_batch_reinit(folios); + return; + } - mem_cgroup_uncharge_list(&pages_to_free); - free_unref_page_list(&pages_to_free); + folios->nr = j; + mem_cgroup_uncharge_folios(folios); + free_unref_folios(folios); +} +EXPORT_SYMBOL(folios_put_refs); + +/** + * release_pages - batched put_page() + * @arg: array of pages to release + * @nr: number of pages + * + * Decrement the reference count on all the pages in @arg. If it + * fell to zero, remove the page from the LRU and free it. + * + * Note that the argument can be an array of pages, encoded pages, + * or folio pointers. We ignore any encoded bits, and turn any of + * them into just a folio that gets free'd. + */ +void release_pages(release_pages_arg arg, int nr) +{ + struct folio_batch fbatch; + int refs[PAGEVEC_SIZE]; + struct encoded_page **encoded = arg.encoded_pages; + int i; + + folio_batch_init(&fbatch); + for (i = 0; i < nr; i++) { + /* Turn any of the argument types into a folio */ + struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); + + /* Is our next entry actually "nr_pages" -> "nr_refs" ? 
*/ + refs[fbatch.nr] = 1; + if (unlikely(encoded_page_flags(encoded[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); + + if (folio_batch_add(&fbatch, folio) > 0) + continue; + folios_put_refs(&fbatch, refs); + } + + if (fbatch.nr) + folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); @@ -1065,8 +1095,7 @@ void __folio_batch_release(struct folio_batch *fbatch) lru_add_drain(); fbatch->percpu_pvec_drained = true; } - release_pages(fbatch->folios, folio_batch_count(fbatch)); - folio_batch_reinit(fbatch); + folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); diff --git a/mm/vmstat.c b/mm/vmstat.c index 38fab5c85304..bfea0422afe4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1358,6 +1358,7 @@ const char * const vmstat_text[] = { "thp_split_page", "thp_split_page_failed", "thp_deferred_split_page", + "thp_underused_split_page", "thp_split_pmd", "thp_shatter_page", "thp_shatter_page_failed", diff --git a/net/Makefile b/net/Makefile index 4c4dc535453d..45f3fbaae644 100644 --- a/net/Makefile +++ b/net/Makefile @@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ -obj-$(CONFIG_UNIX_SCM) += unix/ +obj-$(CONFIG_UNIX) += unix/ obj-y += ipv6/ obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ diff --git a/net/core/scm.c b/net/core/scm.c index 737917c7ac62..df90de176e80 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -36,6 +36,7 @@ #include #include #include +#include /* @@ -85,8 +86,15 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; + fpl->dead = false; + fpl->edges = NULL; + INIT_LIST_HEAD(&fpl->vertices); +#endif } fpp = &fpl->fp[fpl->count]; @@ -109,6 +117,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fput(file); return -EINVAL; } + if (unix_get_socket(file)) + fpl->count_unix++; + *fpp++ = file; fpl->count++; } @@ -366,13 +377,18 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (!fpl) return NULL; - new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), + new_fpl = kmemdup(fpl, sizeof(*fpl), GFP_KERNEL_ACCOUNT); if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) + new_fpl->inflight = false; + new_fpl->edges = NULL; + INIT_LIST_HEAD(&new_fpl->vertices); +#endif } return new_fpl; } diff --git a/net/unix/Kconfig b/net/unix/Kconfig index 28b232f281ab..8b5d04210d7c 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -16,11 +16,6 @@ config UNIX Say Y unless you know what you are doing. 
-config UNIX_SCM - bool - depends on UNIX - default y - config AF_UNIX_OOB bool depends on UNIX diff --git a/net/unix/Makefile b/net/unix/Makefile index 20491825b4d0..4ddd125c4642 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o obj-$(CONFIG_UNIX_DIAG) += unix_diag.o unix_diag-y := diag.o - -obj-$(CONFIG_UNIX_SCM) += scm.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2c16fd81fdce..542b4610a95e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,8 +117,6 @@ #include #include -#include "scm.h" - static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; @@ -980,11 +978,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); - u->inflight = 0; + u->listener = NULL; + u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); @@ -1583,6 +1581,7 @@ restart: newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); + newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); @@ -1678,8 +1677,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; - struct sock *tsk; struct sk_buff *skb; + struct sock *tsk; int err; err = -EOPNOTSUPP; @@ -1709,6 +1708,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, /* attach accepted sock to socket */ unix_state_lock(tsk); + unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); @@ -1752,51 +1752,65 @@ out: return err; } +/* The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. + */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + + /* Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + + if (unix_prepare_fpl(UNIXCB(skb).fp)) + return -ENOMEM; + + return 0; +} + +static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + + unix_destroy_fpl(scm->fp); +} + static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); +} - /* - * Garbage collection of unix sockets starts by selecting a set of - * candidate sockets which have reference only from being in flight - * (total_refs == inflight_refs). 
This condition is checked once during - * the candidate collection phase, and candidates are marked as such, so - * that non-candidates can later be ignored. While inflight_refs is - * protected by unix_gc_lock, total_refs (file count) is not, hence this - * is an instantaneous decision. - * - * Once a candidate, however, the socket must not be reinstalled into a - * file descriptor while the garbage collection is in progress. - * - * If the above conditions are met, then the directed graph of - * candidates (*) does not change while unix_gc_lock is held. - * - * Any operations that changes the file count through file descriptors - * (dup, close, sendmsg) does not change the graph since candidates are - * not installed in fds. - * - * Dequeing a candidate via recvmsg would install it into an fd, but - * that takes unix_gc_lock to decrement the inflight count, so it's - * serialized with garbage collection. - * - * MSG_PEEK is special in that it does not change the inflight count, - * yet does install the socket into an fd. The following lock/unlock - * pair is to ensure serialization with garbage collection. It must be - * done between incrementing the file count and installing the file into - * an fd. - * - * If garbage collection starts after the barrier provided by the - * lock/unlock, then it will see the elevated refcount and not mark this - * as a candidate. If a garbage collection is already in progress - * before the file count was incremented, then the lock/unlock pair will - * ensure that garbage collection is finished before progressing to - * installing the fd. - * - * (*) A -> B where B is on the queue of A or B is on the queue of C - * which is on the queue of listening socket A. - */ - spin_lock(&unix_gc_lock); - spin_unlock(&unix_gc_lock); +static void unix_destruct_scm(struct sk_buff *skb) +{ + struct scm_cookie scm; + + memset(&scm, 0, sizeof(scm)); + scm.pid = UNIXCB(skb).pid; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? 
fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1855,8 +1869,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_add(fp->count, &u->scm_stat.nr_fds); + unix_add_edges(fp, u); + } } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) @@ -1864,8 +1880,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_sub(fp->count, &u->scm_stat.nr_fds); + unix_del_edges(fp); + } } /* @@ -1885,11 +1903,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, long timeo; int err; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; @@ -2157,11 +2176,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, bool fds_sent = false; int data_len; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index a2a8543613a5..23efb78fe9ef 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -81,249 +81,519 @@ #include #include -#include "scm.h" - -/* Internal data structures and random procedures: */ - -static LIST_HEAD(gc_candidates); - -static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) +struct unix_sock *unix_get_socket(struct file *filp) { - struct sk_buff *skb; - struct sk_buff *next; + struct inode *inode = file_inode(filp); - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* Do we have file descriptors ? */ - if (UNIXCB(skb).fp) { - bool hit = false; - /* Process the descriptors of this socket */ - int nfd = UNIXCB(skb).fp->count; - struct file **fp = UNIXCB(skb).fp->fp; + /* Socket ? */ + if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { + struct socket *sock = SOCKET_I(inode); + const struct proto_ops *ops; + struct sock *sk = sock->sk; - while (nfd--) { - /* Get the socket the fd matches if it indeed does so */ - struct unix_sock *u = unix_get_socket(*fp++); + ops = READ_ONCE(sock->ops); - /* Ignore non-candidates, they could have been added - * to the queues after starting the garbage collection - */ - if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; - - func(u); - } - } - if (hit && hitlist != NULL) { - __skb_unlink(skb, &x->sk_receive_queue); - __skb_queue_tail(hitlist, skb); - } - } + /* PF_UNIX ? 
*/ + if (sk && ops && ops->family == PF_UNIX) + return unix_sk(sk); } - spin_unlock(&x->sk_receive_queue.lock); + + return NULL; } -static void scan_children(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) +static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) { - if (x->sk_state != TCP_LISTEN) { - scan_inflight(x, func, hitlist); - } else { - struct sk_buff *skb; - struct sk_buff *next; - struct unix_sock *u; - LIST_HEAD(embryos); - - /* For a listening socket collect the queued embryos - * and perform a scan on them as well. - */ - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - u = unix_sk(skb->sk); - - /* An embryo cannot be in-flight, so it's safe - * to use the list link. - */ - BUG_ON(!list_empty(&u->link)); - list_add_tail(&u->link, &embryos); - } - spin_unlock(&x->sk_receive_queue.lock); - - while (!list_empty(&embryos)) { - u = list_entry(embryos.next, struct unix_sock, link); - scan_inflight(&u->sk, func, hitlist); - list_del_init(&u->link); - } - } -} - -static void dec_inflight(struct unix_sock *usk) -{ - usk->inflight--; -} - -static void inc_inflight(struct unix_sock *usk) -{ - usk->inflight++; -} - -static void inc_inflight_move_tail(struct unix_sock *u) -{ - u->inflight++; - - /* If this still might be part of a cycle, move it to the end - * of the list, so that it's checked even if it was already - * passed over + /* If an embryo socket has a fd, + * the listener indirectly holds the fd's refcnt. */ - if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) - list_move_tail(&u->link, &gc_candidates); + if (edge->successor->listener) + return unix_sk(edge->successor->listener)->vertex; + + return edge->successor->vertex; +} + +static bool unix_graph_maybe_cyclic; +static bool unix_graph_grouped; + +static void unix_update_graph(struct unix_vertex *vertex) +{ + /* If the receiver socket is not inflight, no cyclic + * reference could be formed. 
+ */ + if (!vertex) + return; + + unix_graph_maybe_cyclic = true; + unix_graph_grouped = false; +} + +static LIST_HEAD(unix_unvisited_vertices); + +enum unix_vertex_index { + UNIX_VERTEX_INDEX_MARK1, + UNIX_VERTEX_INDEX_MARK2, + UNIX_VERTEX_INDEX_START, +}; + +static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; + +static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + if (!vertex) { + vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); + vertex->index = unix_vertex_unvisited_index; + vertex->out_degree = 0; + INIT_LIST_HEAD(&vertex->edges); + INIT_LIST_HEAD(&vertex->scc_entry); + + list_move_tail(&vertex->entry, &unix_unvisited_vertices); + edge->predecessor->vertex = vertex; + } + + vertex->out_degree++; + list_add_tail(&edge->vertex_entry, &vertex->edges); + + unix_update_graph(unix_edge_successor(edge)); +} + +static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + if (!fpl->dead) + unix_update_graph(unix_edge_successor(edge)); + + list_del(&edge->vertex_entry); + vertex->out_degree--; + + if (!vertex->out_degree) { + edge->predecessor->vertex = NULL; + list_move_tail(&vertex->entry, &fpl->vertices); + } +} + +static void unix_free_vertices(struct scm_fp_list *fpl) +{ + struct unix_vertex *vertex, *next_vertex; + + list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { + list_del(&vertex->entry); + kfree(vertex); + } +} + +static DEFINE_SPINLOCK(unix_gc_lock); +unsigned int unix_tot_inflight; + +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) +{ + int i = 0, j = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); + struct unix_edge *edge; + + if (!inflight) + continue; + + edge = fpl->edges + i++; + edge->predecessor = inflight; + edge->successor = receiver; + + unix_add_edge(fpl, edge); + } while (i < fpl->count_unix); + + receiver->scm_stat.nr_unix_fds += fpl->count_unix; + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); +out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); + + spin_unlock(&unix_gc_lock); + + fpl->inflight = true; + + unix_free_vertices(fpl); +} + +void unix_del_edges(struct scm_fp_list *fpl) +{ + struct unix_sock *receiver; + int i = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_edge *edge = fpl->edges + i++; + + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); + + if (!fpl->dead) { + receiver = fpl->edges[0].successor; + receiver->scm_stat.nr_unix_fds -= fpl->count_unix; + } + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); +out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); + + spin_unlock(&unix_gc_lock); + + fpl->inflight = false; +} + +void unix_update_edges(struct unix_sock *receiver) +{ + /* nr_unix_fds is only updated under unix_state_lock(). + * If it's 0 here, the embryo socket is not part of the + * inflight graph, and GC will not see it, so no lock needed. 
+ */ + if (!receiver->scm_stat.nr_unix_fds) { + receiver->listener = NULL; + } else { + spin_lock(&unix_gc_lock); + unix_update_graph(unix_sk(receiver->listener)->vertex); + receiver->listener = NULL; + spin_unlock(&unix_gc_lock); + } +} + +int unix_prepare_fpl(struct scm_fp_list *fpl) +{ + struct unix_vertex *vertex; + int i; + + if (!fpl->count_unix) + return 0; + + for (i = 0; i < fpl->count_unix; i++) { + vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); + if (!vertex) + goto err; + + list_add(&vertex->entry, &fpl->vertices); + } + + fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), + GFP_KERNEL_ACCOUNT); + if (!fpl->edges) + goto err; + + return 0; + +err: + unix_free_vertices(fpl); + return -ENOMEM; +} + +void unix_destroy_fpl(struct scm_fp_list *fpl) +{ + if (fpl->inflight) + unix_del_edges(fpl); + + kvfree(fpl->edges); + unix_free_vertices(fpl); +} + +static bool unix_vertex_dead(struct unix_vertex *vertex) +{ + struct unix_edge *edge; + struct unix_sock *u; + long total_ref; + + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); + + /* The vertex's fd can be received by a non-inflight socket. */ + if (!next_vertex) + return false; + + /* The vertex's fd can be received by an inflight socket in + * another SCC. + */ + if (next_vertex->scc_index != vertex->scc_index) + return false; + } + + /* No receiver exists out of the same SCC. */ + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + total_ref = file_count(u->sk.sk_socket->file); + + /* If not close()d, total_ref > out_degree. */ + if (total_ref != vertex->out_degree) + return false; + + return true; +} + +enum unix_recv_queue_lock_class { + U_RECVQ_LOCK_NORMAL, + U_RECVQ_LOCK_EMBRYO, +}; + +static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist) +{ + skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + WARN_ON_ONCE(skb_unref(u->oob_skb)); + u->oob_skb = NULL; + } +#endif +} + +static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) +{ + struct unix_vertex *vertex; + + list_for_each_entry_reverse(vertex, scc, scc_entry) { + struct sk_buff_head *queue; + struct unix_edge *edge; + struct unix_sock *u; + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + queue = &u->sk.sk_receive_queue; + + spin_lock(&queue->lock); + + if (u->sk.sk_state == TCP_LISTEN) { + struct sk_buff *skb; + + skb_queue_walk(queue, skb) { + struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; + + /* listener -> embryo order, the inversion never happens. */ + spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); + unix_collect_queue(unix_sk(skb->sk), hitlist); + spin_unlock(&embryo_queue->lock); + } + } else { + unix_collect_queue(u, hitlist); + } + + spin_unlock(&queue->lock); + } +} + +static bool unix_scc_cyclic(struct list_head *scc) +{ + struct unix_vertex *vertex; + struct unix_edge *edge; + + /* SCC containing multiple vertices ? */ + if (!list_is_singular(scc)) + return true; + + vertex = list_first_entry(scc, typeof(*vertex), scc_entry); + + /* Self-reference or a embryo-listener circle ? 
*/ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + if (unix_edge_successor(edge) == vertex) + return true; + } + + return false; +} + +static LIST_HEAD(unix_visited_vertices); +static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; + +static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, + struct sk_buff_head *hitlist) +{ + LIST_HEAD(vertex_stack); + struct unix_edge *edge; + LIST_HEAD(edge_stack); + +next_vertex: + /* Push vertex to vertex_stack and mark it as on-stack + * (index >= UNIX_VERTEX_INDEX_START). + * The vertex will be popped when finalising SCC later. + */ + list_add(&vertex->scc_entry, &vertex_stack); + + vertex->index = *last_index; + vertex->scc_index = *last_index; + (*last_index)++; + + /* Explore neighbour vertices (receivers of the current vertex's fd). */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); + + if (!next_vertex) + continue; + + if (next_vertex->index == unix_vertex_unvisited_index) { + /* Iterative deepening depth first search + * + * 1. Push a forward edge to edge_stack and set + * the successor to vertex for the next iteration. + */ + list_add(&edge->stack_entry, &edge_stack); + + vertex = next_vertex; + goto next_vertex; + + /* 2. Pop the edge directed to the current vertex + * and restore the ancestor for backtracking. + */ +prev_vertex: + edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); + list_del_init(&edge->stack_entry); + + next_vertex = vertex; + vertex = edge->predecessor->vertex; + + /* If the successor has a smaller scc_index, two vertices + * are in the same SCC, so propagate the smaller scc_index + * to skip SCC finalisation. + */ + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else if (next_vertex->index != unix_vertex_grouped_index) { + /* Loop detected by a back/cross edge. + * + * The successor is on vertex_stack, so two vertices are in + * the same SCC. If the successor has a smaller *scc_index*, + * propagate it to skip SCC finalisation. + */ + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else { + /* The successor was already grouped as another SCC */ + } + } + + if (vertex->index == vertex->scc_index) { + struct unix_vertex *v; + struct list_head scc; + bool scc_dead = true; + + /* SCC finalised. + * + * If the scc_index was not updated, all the vertices above on + * vertex_stack are in the same SCC. Group them using scc_entry. + */ + __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); + + list_for_each_entry_reverse(v, &scc, scc_entry) { + /* Don't restart DFS from this vertex in unix_walk_scc(). */ + list_move_tail(&v->entry, &unix_visited_vertices); + + /* Mark vertex as off-stack. */ + v->index = unix_vertex_grouped_index; + + if (scc_dead) + scc_dead = unix_vertex_dead(v); + } + + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); + } + + /* Need backtracking ? */ + if (!list_empty(&edge_stack)) + goto prev_vertex; +} + +static void unix_walk_scc(struct sk_buff_head *hitlist) +{ + unsigned long last_index = UNIX_VERTEX_INDEX_START; + + unix_graph_maybe_cyclic = false; + + /* Visit every vertex exactly once. + * __unix_walk_scc() moves visited vertices to unix_visited_vertices. 
+ */ + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + __unix_walk_scc(vertex, &last_index, hitlist); + } + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); + + unix_graph_grouped = true; +} + +static void unix_walk_scc_fast(struct sk_buff_head *hitlist) +{ + unix_graph_maybe_cyclic = false; + + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + struct list_head scc; + bool scc_dead = true; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + list_add(&scc, &vertex->scc_entry); + + list_for_each_entry_reverse(vertex, &scc, scc_entry) { + list_move_tail(&vertex->entry, &unix_visited_vertices); + + if (scc_dead) + scc_dead = unix_vertex_dead(vertex); + } + + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); + } + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); } static bool gc_in_progress; static void __unix_gc(struct work_struct *work) { - struct sk_buff *next_skb, *skb; - struct unix_sock *u; - struct unix_sock *next; struct sk_buff_head hitlist; - struct list_head cursor; - LIST_HEAD(not_cycle_list); + struct sk_buff *skb; spin_lock(&unix_gc_lock); - /* First, select candidates for garbage collection. Only - * in-flight sockets are considered, and from those only ones - * which don't have any external reference. - * - * Holding unix_gc_lock will protect these candidates from - * being detached, and hence from gaining an external - * reference. Since there are no possible receivers, all - * buffers currently on the candidates' queues stay there - * during the garbage collection. - * - * We also know that no new candidate can be added onto the - * receive queues. Other, non candidate sockets _can_ be - * added to queue, so we must make sure only to touch - * candidates. - * - * Embryos, though never candidates themselves, affect which - * candidates are reachable by the garbage collector. Before - * being added to a listener's queue, an embryo may already - * receive data carrying SCM_RIGHTS, potentially making the - * passed socket a candidate that is not yet reachable by the - * collector. It becomes reachable once the embryo is - * enqueued. Therefore, we must ensure that no SCM-laden - * embryo appears in a (candidate) listener's queue between - * consecutive scan_children() calls. - */ - list_for_each_entry_safe(u, next, &gc_inflight_list, link) { - struct sock *sk = &u->sk; - long total_refs; - - total_refs = file_count(sk->sk_socket->file); - - BUG_ON(!u->inflight); - BUG_ON(total_refs < u->inflight); - if (total_refs == u->inflight) { - list_move_tail(&u->link, &gc_candidates); - __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - - if (sk->sk_state == TCP_LISTEN) { - unix_state_lock_nested(sk, U_LOCK_GC_LISTENER); - unix_state_unlock(sk); - } - } + if (!unix_graph_maybe_cyclic) { + spin_unlock(&unix_gc_lock); + goto skip_gc; } - /* Now remove all internal in-flight reference to children of - * the candidates. - */ - list_for_each_entry(u, &gc_candidates, link) - scan_children(&u->sk, dec_inflight, NULL); + __skb_queue_head_init(&hitlist); - /* Restore the references for children of all candidates, - * which have remaining references. 
Do this recursively, so - * only those remain, which form cyclic references. - * - * Use a "cursor" link, to make the list traversal safe, even - * though elements might be moved about. - */ - list_add(&cursor, &gc_candidates); - while (cursor.next != &gc_candidates) { - u = list_entry(cursor.next, struct unix_sock, link); - - /* Move cursor to after the current position. */ - list_move(&cursor, &u->link); - - if (u->inflight) { - list_move_tail(&u->link, ¬_cycle_list); - __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - scan_children(&u->sk, inc_inflight_move_tail, NULL); - } - } - list_del(&cursor); - - /* Now gc_candidates contains only garbage. Restore original - * inflight counters for these as well, and remove the skbuffs - * which are creating the cycle(s). - */ - skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) { - scan_children(&u->sk, inc_inflight, &hitlist); - -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; - } -#endif - } - - /* not_cycle_list contains those sockets which do not make up a - * cycle. Restore these to the inflight list. - */ - while (!list_empty(¬_cycle_list)) { - u = list_entry(not_cycle_list.next, struct unix_sock, link); - __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - list_move_tail(&u->link, &gc_inflight_list); - } + if (unix_graph_grouped) + unix_walk_scc_fast(&hitlist); + else + unix_walk_scc(&hitlist); spin_unlock(&unix_gc_lock); - /* We need io_uring to clean its registered files, ignore all io_uring - * originated skbs. It's fine as io_uring doesn't keep references to - * other io_uring instances and so killing all other files in the cycle - * will put all io_uring references forcing it to go through normal - * release.path eventually putting registered files. - */ - skb_queue_walk_safe(&hitlist, skb, next_skb) { - if (skb->destructor == io_uring_destruct_scm) { - __skb_unlink(skb, &hitlist); - skb_queue_tail(&skb->sk->sk_receive_queue, skb); - } + skb_queue_walk(&hitlist, skb) { + if (UNIXCB(skb).fp) + UNIXCB(skb).fp->dead = true; } - /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); - - spin_lock(&unix_gc_lock); - - /* There could be io_uring registered files, just push them back to - * the inflight list - */ - list_for_each_entry_safe(u, next, &gc_candidates, link) - list_move_tail(&u->link, &gc_inflight_list); - - /* All candidates should have been detached by now. */ - BUG_ON(!list_empty(&gc_candidates)); - - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ +skip_gc: WRITE_ONCE(gc_in_progress, false); - - spin_unlock(&unix_gc_lock); } static DECLARE_WORK(unix_gc_work, __unix_gc); @@ -335,8 +605,9 @@ void unix_gc(void) } #define UNIX_INFLIGHT_TRIGGER_GC 16000 +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) -void wait_for_unix_gc(void) +void wait_for_unix_gc(struct scm_fp_list *fpl) { /* If number of inflight sockets is insane, * force a garbage collect right now. @@ -348,6 +619,13 @@ void wait_for_unix_gc(void) !READ_ONCE(gc_in_progress)) unix_gc(); + /* Penalise users who want to send AF_UNIX sockets + * but whose sockets have not been received yet. 
+ */ + if (!fpl || !fpl->count_unix || + READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) + return; + if (READ_ONCE(gc_in_progress)) flush_work(&unix_gc_work); } diff --git a/net/unix/scm.c b/net/unix/scm.c deleted file mode 100644 index b5ae5ab16777..000000000000 --- a/net/unix/scm.c +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "scm.h" - -unsigned int unix_tot_inflight; -EXPORT_SYMBOL(unix_tot_inflight); - -LIST_HEAD(gc_inflight_list); -EXPORT_SYMBOL(gc_inflight_list); - -DEFINE_SPINLOCK(unix_gc_lock); -EXPORT_SYMBOL(unix_gc_lock); - -struct unix_sock *unix_get_socket(struct file *filp) -{ - struct inode *inode = file_inode(filp); - - /* Socket ? */ - if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { - struct socket *sock = SOCKET_I(inode); - const struct proto_ops *ops = READ_ONCE(sock->ops); - struct sock *s = sock->sk; - - /* PF_UNIX ? */ - if (s && ops && ops->family == PF_UNIX) - return unix_sk(s); - } - - return NULL; -} -EXPORT_SYMBOL(unix_get_socket); - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ -void unix_inflight(struct user_struct *user, struct file *fp) -{ - struct unix_sock *u = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (u) { - if (!u->inflight) { - BUG_ON(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - BUG_ON(list_empty(&u->link)); - } - u->inflight++; - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *fp) -{ - struct unix_sock *u = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (u) { - BUG_ON(!u->inflight); - BUG_ON(list_empty(&u->link)); - - u->inflight--; - if (!u->inflight) - list_del_init(&u->link); - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); - spin_unlock(&unix_gc_lock); -} - -/* - * The "user->unix_inflight" variable is protected by the garbage - * collection lock, and we just read it locklessly here. If you go - * over the limit, there might be a tiny race in actually noticing - * it across threads. Tough. - */ -static inline bool too_many_unix_fds(struct task_struct *p) -{ - struct user_struct *user = current_user(); - - if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) - return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); - return false; -} - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - if (too_many_unix_fds(current)) - return -ETOOMANYREFS; - - /* - * Need to duplicate file references for the sake of garbage - * collection. Otherwise a socket in the fps might become a - * candidate for GC while the skb is not yet queued. 
- */ - UNIXCB(skb).fp = scm_fp_dup(scm->fp); - if (!UNIXCB(skb).fp) - return -ENOMEM; - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - return 0; -} -EXPORT_SYMBOL(unix_attach_fds); - -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - scm->fp = UNIXCB(skb).fp; - UNIXCB(skb).fp = NULL; - - for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); -} -EXPORT_SYMBOL(unix_detach_fds); - -void unix_destruct_scm(struct sk_buff *skb) -{ - struct scm_cookie scm; - - memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; - if (UNIXCB(skb).fp) - unix_detach_fds(&scm, skb); - - /* Alas, it calls VFS */ - /* So fscking what? fput() had been SMP-safe since the last Summer */ - scm_destroy(&scm); - sock_wfree(skb); -} -EXPORT_SYMBOL(unix_destruct_scm); - -void io_uring_destruct_scm(struct sk_buff *skb) -{ - unix_destruct_scm(skb); -} -EXPORT_SYMBOL(io_uring_destruct_scm); diff --git a/net/unix/scm.h b/net/unix/scm.h deleted file mode 100644 index 5a255a477f16..000000000000 --- a/net/unix/scm.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef NET_UNIX_SCM_H -#define NET_UNIX_SCM_H - -extern struct list_head gc_inflight_list; -extern spinlock_t unix_gc_lock; - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb); -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb); - -#endif diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c index 77685a7eb767..2cb61492102d 100644 --- a/samples/ftrace/sample-trace-array.c +++ b/samples/ftrace/sample-trace-array.c @@ -105,7 +105,8 @@ static int __init sample_trace_array_init(void) * NOTE: This function increments the reference counter * associated with the trace array - "tr". */ - tr = trace_array_get_by_name("sample-instance"); + tr = trace_array_get_by_name_ext("sample-instance", + "sched,timer,kprobes"); if (!tr) return -1; diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index dff3be23488b..646b0c4e0244 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -88,6 +88,76 @@ static void write_debugfs(const char *fmt, ...) } } +static char *allocate_zero_filled_hugepage(size_t len) +{ + char *result; + size_t i; + + result = memalign(pmd_pagesize, len); + if (!result) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + + madvise(result, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + result[i] = (char)0; + + return result; +} + +static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len) +{ + unsigned long rss_anon_before, rss_anon_after; + size_t i; + + if (!check_huge_anon(one_page, 4, pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + rss_anon_before = rss_anon(); + if (!rss_anon_before) { + printf("No RssAnon is allocated before split\n"); + exit(EXIT_FAILURE); + } + + /* split all THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len, 0); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)0) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + if (!check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); + exit(EXIT_FAILURE); + } + + rss_anon_after = rss_anon(); + if (rss_anon_after >= rss_anon_before) { + printf("Incorrect RssAnon value. 
Before: %ld After: %ld\n", + rss_anon_before, rss_anon_after); + exit(EXIT_FAILURE); + } +} + +void split_pmd_zero_pages(void) +{ + char *one_page; + int nr_hpages = 4; + size_t len = nr_hpages * pmd_pagesize; + + one_page = allocate_zero_filled_hugepage(len); + verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len); + printf("Split zero filled huge pages successful\n"); + free(one_page); +} + void split_pmd_thp(void) { char *one_page; @@ -305,6 +375,7 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } + split_pmd_zero_pages(); split_pmd_thp(); split_pte_mapped_thp(); split_file_backed_thp(); diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 558c9cd8901c..598826ebce8f 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -11,6 +11,7 @@ #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" #define SMAP_FILE_PATH "/proc/self/smaps" +#define STATUS_FILE_PATH "/proc/self/status" #define MAX_LINE_LENGTH 500 unsigned int __page_size; @@ -97,6 +98,27 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } +unsigned long rss_anon(void) +{ + unsigned long rss_anon = 0; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + + fp = fopen(STATUS_FILE_PATH, "r"); + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH); + + if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer))) + goto err_out; + + if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1) + ksft_exit_fail_msg("Reading status error\n"); + +err_out: + fclose(fp); + return rss_anon; +} + bool __check_huge(void *addr, char *pattern, int nr_hpages, uint64_t hpage_size) { diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 0c603bec5e20..65bee2e5624e 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -39,6 +39,7 @@ unsigned long pagemap_get_pfn(int fd, char *start); void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); +unsigned long rss_anon(void); bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
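
The reworked garbage collector in the net/unix/garbage.c hunk above exists to reclaim the one case plain refcounting cannot handle: AF_UNIX sockets whose only remaining references are SCM_RIGHTS messages sitting in receive queues. The stand-alone user-space program below is a minimal sketch of that situation, not part of the patch, with illustrative names only: a socketpair end's own descriptor is queued on it via SCM_RIGHTS and both ends are then closed, so the sole reference left on the file is the in-flight one. With this series applied, unix_add_edges() records the resulting self-edge, and the SCC walk should find a dead component and purge the queued skb via unix_collect_skb().

```c
/* Sketch only: create a self-referencing in-flight AF_UNIX cycle. */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	int sv[2];
	char data = 'x';
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0) {
		perror("socketpair");
		return 1;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	/* Pass sv[0] through the pair: the skb queued on sv[0]'s
	 * receive queue now pins sv[0]'s own file.
	 */
	memcpy(CMSG_DATA(cmsg), &sv[0], sizeof(int));

	if (sendmsg(sv[1], &msg, 0) < 0) {
		perror("sendmsg");
		return 1;
	}

	/* Close both ends without receiving; only the unix GC can
	 * break the remaining in-flight cycle.
	 */
	close(sv[0]);
	close(sv[1]);
	return 0;
}
```

Doing this in bulk from one user is also the pattern the per-user accounting in wait_for_unix_gc() (fpl->user->unix_inflight against UNIX_INFLIGHT_SANE_USER) is meant to throttle.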
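
__unix_walk_scc() in the same hunk is an iterative, Tarjan-style strongly connected component search: every vertex gets an index and an scc_index, edges to vertices still on vertex_stack pull scc_index down, and when a vertex finishes with index == scc_index the whole component is cut off the stack and tested with unix_vertex_dead(). The short program below is a textbook, recursive rendering of the same bookkeeping over a made-up four-node graph; it is an illustration only, not kernel code. The kernel variant avoids recursion with edge_stack, and later GC runs reuse the grouping via unix_graph_grouped and unix_walk_scc_fast().

```c
/* Minimal recursive Tarjan SCC over a toy "fd passing" graph. */
#include <stdio.h>

#define NV 4

/* 0 -> 1 -> 2 -> 0 form a cycle; 2 also sends to 3, which sends nothing. */
static const int adj[NV][NV] = {
	{0, 1, 0, 0},
	{0, 0, 1, 0},
	{1, 0, 0, 1},
	{0, 0, 0, 0},
};

static int index_counter;
static int idx[NV], low[NV], on_stack[NV];
static int stack[NV], sp;

static void strongconnect(int v)
{
	int w;

	idx[v] = low[v] = ++index_counter;
	stack[sp++] = v;
	on_stack[v] = 1;

	for (w = 0; w < NV; w++) {
		if (!adj[v][w])
			continue;
		if (!idx[w]) {
			/* Tree edge: recurse, then take the child's lowlink. */
			strongconnect(w);
			if (low[w] < low[v])
				low[v] = low[w];
		} else if (on_stack[w] && idx[w] < low[v]) {
			/* Back/cross edge to an on-stack vertex. */
			low[v] = idx[w];
		}
	}

	/* v is the root of an SCC: pop the whole component. */
	if (low[v] == idx[v]) {
		printf("SCC:");
		do {
			w = stack[--sp];
			on_stack[w] = 0;
			printf(" %d", w);
		} while (w != v);
		printf("\n");
	}
}

int main(void)
{
	int v;

	for (v = 0; v < NV; v++)
		if (!idx[v])
			strongconnect(v);
	return 0;
}
```

Node 3 plays the role of a socket that is not in flight: it forms its own trivial SCC and is never collected, while the 0-1-2 cycle corresponds to a component that becomes dead once no external file references remain, the condition unix_vertex_dead() checks with file_count() against out_degree.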