bpf, sockmap: Fix panic when calling skb_linearize

[ Upstream commit 5ca2e29f6834c64c0e5a9ccf1278c21fb49b827e ]

The panic can be reproduced by executing the command:
./bench sockmap -c 2 -p 1 -a --rx-verdict-ingress --rx-strp 100000

Then a kernel panic was captured:
'''
[  657.460555] kernel BUG at net/core/skbuff.c:2178!
[  657.462680] Tainted: [W]=WARN
[  657.463287] Workqueue: events sk_psock_backlog
...
[  657.469610]  <TASK>
[  657.469738]  ? die+0x36/0x90
[  657.469916]  ? do_trap+0x1d0/0x270
[  657.470118]  ? pskb_expand_head+0x612/0xf40
[  657.470376]  ? pskb_expand_head+0x612/0xf40
[  657.470620]  ? do_error_trap+0xa3/0x170
[  657.470846]  ? pskb_expand_head+0x612/0xf40
[  657.471092]  ? handle_invalid_op+0x2c/0x40
[  657.471335]  ? pskb_expand_head+0x612/0xf40
[  657.471579]  ? exc_invalid_op+0x2d/0x40
[  657.471805]  ? asm_exc_invalid_op+0x1a/0x20
[  657.472052]  ? pskb_expand_head+0xd1/0xf40
[  657.472292]  ? pskb_expand_head+0x612/0xf40
[  657.472540]  ? lock_acquire+0x18f/0x4e0
[  657.472766]  ? find_held_lock+0x2d/0x110
[  657.472999]  ? __pfx_pskb_expand_head+0x10/0x10
[  657.473263]  ? __kmalloc_cache_noprof+0x5b/0x470
[  657.473537]  ? __pfx___lock_release.isra.0+0x10/0x10
[  657.473826]  __pskb_pull_tail+0xfd/0x1d20
[  657.474062]  ? __kasan_slab_alloc+0x4e/0x90
[  657.474707]  sk_psock_skb_ingress_enqueue+0x3bf/0x510
[  657.475392]  ? __kasan_kmalloc+0xaa/0xb0
[  657.476010]  sk_psock_backlog+0x5cf/0xd70
[  657.476637]  process_one_work+0x858/0x1a20
'''

The panic originates from the assertion BUG_ON(skb_shared(skb)) in
skb_linearize(). A previous commit (see Fixes tag) introduced skb_get()
to avoid race conditions between skb operations in the backlog and skb
release in the recvmsg path. However, this caused the panic to always
occur when skb_linearize is executed.

The "--rx-strp 100000" parameter forces the RX path to use the strparser
module which aggregates data until it reaches 100KB before calling sockmap
logic. The 100KB payload exceeds MAX_MSG_FRAGS, triggering skb_linearize.

To fix this issue, just move skb_get into sk_psock_skb_ingress_enqueue.

'''
sk_psock_backlog:
    sk_psock_handle_skb
       skb_get(skb) <== we move it into 'sk_psock_skb_ingress_enqueue'
       sk_psock_skb_ingress____________
                                       ↓
                                       |
                                       | → sk_psock_skb_ingress_self
                                       |      sk_psock_skb_ingress_enqueue
sk_psock_verdict_apply_________________↑          skb_linearize
'''

Note that for the verdict_apply path, the skb_get operation is unnecessary,
so we add a 'take_ref' param to control its behavior.

Fixes: a454d84ee2 ("bpf, sockmap: Fix skb refcnt race after locking changes")
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Link: https://lore.kernel.org/r/20250407142234.47591-4-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
This commit is contained in:
Jiayuan Chen
2025-04-07 22:21:22 +08:00
committed by Greg Kroah-Hartman
parent 34837ae8cd
commit db1d15a26f

View File

@@ -529,16 +529,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
u32 off, u32 len, u32 off, u32 len,
struct sk_psock *psock, struct sk_psock *psock,
struct sock *sk, struct sock *sk,
struct sk_msg *msg) struct sk_msg *msg,
bool take_ref)
{ {
int num_sge, copied; int num_sge, copied;
/* skb_to_sgvec will fail when the total number of fragments in
* frag_list and frags exceeds MAX_MSG_FRAGS. For example, the
* caller may aggregate multiple skbs.
*/
num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
if (num_sge < 0) { if (num_sge < 0) {
/* skb linearize may fail with ENOMEM, but lets simply try again /* skb linearize may fail with ENOMEM, but lets simply try again
* later if this happens. Under memory pressure we don't want to * later if this happens. Under memory pressure we don't want to
* drop the skb. We need to linearize the skb so that the mapping * drop the skb. We need to linearize the skb so that the mapping
* in skb_to_sgvec can not error. * in skb_to_sgvec can not error.
* Note that skb_linearize requires the skb not to be shared.
*/ */
if (skb_linearize(skb)) if (skb_linearize(skb))
return -EAGAIN; return -EAGAIN;
@@ -555,7 +561,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
msg->sg.start = 0; msg->sg.start = 0;
msg->sg.size = copied; msg->sg.size = copied;
msg->sg.end = num_sge; msg->sg.end = num_sge;
msg->skb = skb; msg->skb = take_ref ? skb_get(skb) : skb;
sk_psock_queue_msg(psock, msg); sk_psock_queue_msg(psock, msg);
sk_psock_data_ready(sk, psock); sk_psock_data_ready(sk, psock);
@@ -563,7 +569,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
} }
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len); u32 off, u32 len, bool take_ref);
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len) u32 off, u32 len)
@@ -577,7 +583,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
* correctly. * correctly.
*/ */
if (unlikely(skb->sk == sk)) if (unlikely(skb->sk == sk))
return sk_psock_skb_ingress_self(psock, skb, off, len); return sk_psock_skb_ingress_self(psock, skb, off, len, true);
msg = sk_psock_create_ingress_msg(sk, skb); msg = sk_psock_create_ingress_msg(sk, skb);
if (!msg) if (!msg)
return -EAGAIN; return -EAGAIN;
@@ -589,7 +595,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
* into user buffers. * into user buffers.
*/ */
skb_set_owner_r(skb, sk); skb_set_owner_r(skb, sk);
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true);
if (err < 0) if (err < 0)
kfree(msg); kfree(msg);
return err; return err;
@@ -600,7 +606,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
* because the skb is already accounted for here. * because the skb is already accounted for here.
*/ */
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len) u32 off, u32 len, bool take_ref)
{ {
struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
struct sock *sk = psock->sk; struct sock *sk = psock->sk;
@@ -609,7 +615,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
if (unlikely(!msg)) if (unlikely(!msg))
return -EAGAIN; return -EAGAIN;
skb_set_owner_r(skb, sk); skb_set_owner_r(skb, sk);
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
if (err < 0) if (err < 0)
kfree(msg); kfree(msg);
return err; return err;
@@ -618,18 +624,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len, bool ingress) u32 off, u32 len, bool ingress)
{ {
int err = 0;
if (!ingress) { if (!ingress) {
if (!sock_writeable(psock->sk)) if (!sock_writeable(psock->sk))
return -EAGAIN; return -EAGAIN;
return skb_send_sock(psock->sk, skb, off, len); return skb_send_sock(psock->sk, skb, off, len);
} }
skb_get(skb);
err = sk_psock_skb_ingress(psock, skb, off, len); return sk_psock_skb_ingress(psock, skb, off, len);
if (err < 0)
kfree_skb(skb);
return err;
} }
static void sk_psock_skb_state(struct sk_psock *psock, static void sk_psock_skb_state(struct sk_psock *psock,
@@ -1017,7 +1018,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
off = stm->offset; off = stm->offset;
len = stm->full_len; len = stm->full_len;
} }
err = sk_psock_skb_ingress_self(psock, skb, off, len); err = sk_psock_skb_ingress_self(psock, skb, off, len, false);
} }
if (err < 0) { if (err < 0) {
spin_lock_bh(&psock->ingress_lock); spin_lock_bh(&psock->ingress_lock);