bpf, sockmap: Fix panic when calling skb_linearize
[ Upstream commit 5ca2e29f6834c64c0e5a9ccf1278c21fb49b827e ]
The panic can be reproduced by executing the command:
./bench sockmap -c 2 -p 1 -a --rx-verdict-ingress --rx-strp 100000
Then a kernel panic was captured:
'''
[ 657.460555] kernel BUG at net/core/skbuff.c:2178!
[ 657.462680] Tainted: [W]=WARN
[ 657.463287] Workqueue: events sk_psock_backlog
...
[ 657.469610] <TASK>
[ 657.469738] ? die+0x36/0x90
[ 657.469916] ? do_trap+0x1d0/0x270
[ 657.470118] ? pskb_expand_head+0x612/0xf40
[ 657.470376] ? pskb_expand_head+0x612/0xf40
[ 657.470620] ? do_error_trap+0xa3/0x170
[ 657.470846] ? pskb_expand_head+0x612/0xf40
[ 657.471092] ? handle_invalid_op+0x2c/0x40
[ 657.471335] ? pskb_expand_head+0x612/0xf40
[ 657.471579] ? exc_invalid_op+0x2d/0x40
[ 657.471805] ? asm_exc_invalid_op+0x1a/0x20
[ 657.472052] ? pskb_expand_head+0xd1/0xf40
[ 657.472292] ? pskb_expand_head+0x612/0xf40
[ 657.472540] ? lock_acquire+0x18f/0x4e0
[ 657.472766] ? find_held_lock+0x2d/0x110
[ 657.472999] ? __pfx_pskb_expand_head+0x10/0x10
[ 657.473263] ? __kmalloc_cache_noprof+0x5b/0x470
[ 657.473537] ? __pfx___lock_release.isra.0+0x10/0x10
[ 657.473826] __pskb_pull_tail+0xfd/0x1d20
[ 657.474062] ? __kasan_slab_alloc+0x4e/0x90
[ 657.474707] sk_psock_skb_ingress_enqueue+0x3bf/0x510
[ 657.475392] ? __kasan_kmalloc+0xaa/0xb0
[ 657.476010] sk_psock_backlog+0x5cf/0xd70
[ 657.476637] process_one_work+0x858/0x1a20
'''
The panic originates from the assertion BUG_ON(skb_shared(skb)) in
skb_linearize(). A previous commit (see Fixes tag) introduced skb_get()
to avoid race conditions between skb operations in the backlog and skb
release in the recvmsg path. However, this caused the panic to always
occur when skb_linearize is executed.
The "--rx-strp 100000" parameter forces the RX path to use the strparser
module which aggregates data until it reaches 100KB before calling sockmap
logic. The 100KB payload exceeds MAX_MSG_FRAGS, triggering skb_linearize.
To fix this issue, just move skb_get into sk_psock_skb_ingress_enqueue.
'''
sk_psock_backlog:
sk_psock_handle_skb
skb_get(skb) <== we move it into 'sk_psock_skb_ingress_enqueue'
sk_psock_skb_ingress____________
↓
|
| → sk_psock_skb_ingress_self
| sk_psock_skb_ingress_enqueue
sk_psock_verdict_apply_________________↑ skb_linearize
'''
Note that for the verdict_apply path, the skb_get operation is unnecessary,
so we add a 'take_ref' parameter to control its behavior.
Fixes: a454d84ee2 ("bpf, sockmap: Fix skb refcnt race after locking changes")
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Link: https://lore.kernel.org/r/20250407142234.47591-4-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Commit: db1d15a26f
Parent: 34837ae8cd
Committed by: Greg Kroah-Hartman
@@ -529,16 +529,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
 					u32 off, u32 len,
 					struct sk_psock *psock,
 					struct sock *sk,
-					struct sk_msg *msg)
+					struct sk_msg *msg,
+					bool take_ref)
 {
 	int num_sge, copied;
 
+	/* skb_to_sgvec will fail when the total number of fragments in
+	 * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the
+	 * caller may aggregate multiple skbs.
+	 */
 	num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
 	if (num_sge < 0) {
 		/* skb linearize may fail with ENOMEM, but lets simply try again
 		 * later if this happens. Under memory pressure we don't want to
 		 * drop the skb. We need to linearize the skb so that the mapping
 		 * in skb_to_sgvec can not error.
+		 * Note that skb_linearize requires the skb not to be shared.
 		 */
 		if (skb_linearize(skb))
 			return -EAGAIN;
@@ -555,7 +561,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
 	msg->sg.start = 0;
 	msg->sg.size = copied;
 	msg->sg.end = num_sge;
-	msg->skb = skb;
+	msg->skb = take_ref ? skb_get(skb) : skb;
 
 	sk_psock_queue_msg(psock, msg);
 	sk_psock_data_ready(sk, psock);
@@ -563,7 +569,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
 }
 
 static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
-				     u32 off, u32 len);
+				     u32 off, u32 len, bool take_ref);
 
 static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
 				u32 off, u32 len)
@@ -577,7 +583,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
 	 * correctly.
 	 */
 	if (unlikely(skb->sk == sk))
-		return sk_psock_skb_ingress_self(psock, skb, off, len);
+		return sk_psock_skb_ingress_self(psock, skb, off, len, true);
 	msg = sk_psock_create_ingress_msg(sk, skb);
 	if (!msg)
 		return -EAGAIN;
@@ -589,7 +595,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
 	 * into user buffers.
 	 */
 	skb_set_owner_r(skb, sk);
-	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true);
 	if (err < 0)
 		kfree(msg);
 	return err;
@@ -600,7 +606,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
 * because the skb is already accounted for here.
 */
 static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
-				     u32 off, u32 len)
+				     u32 off, u32 len, bool take_ref)
 {
 	struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
 	struct sock *sk = psock->sk;
@@ -609,7 +615,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
 	if (unlikely(!msg))
 		return -EAGAIN;
 	skb_set_owner_r(skb, sk);
-	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
 	if (err < 0)
 		kfree(msg);
 	return err;
@@ -618,18 +624,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
 static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 			       u32 off, u32 len, bool ingress)
 {
-	int err = 0;
-
 	if (!ingress) {
 		if (!sock_writeable(psock->sk))
 			return -EAGAIN;
 		return skb_send_sock(psock->sk, skb, off, len);
 	}
-	skb_get(skb);
-	err = sk_psock_skb_ingress(psock, skb, off, len);
-	if (err < 0)
-		kfree_skb(skb);
-	return err;
+
+	return sk_psock_skb_ingress(psock, skb, off, len);
 }
 
 static void sk_psock_skb_state(struct sk_psock *psock,
@@ -1017,7 +1018,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 			off = stm->offset;
 			len = stm->full_len;
 		}
-		err = sk_psock_skb_ingress_self(psock, skb, off, len);
+		err = sk_psock_skb_ingress_self(psock, skb, off, len, false);
 	}
 	if (err < 0) {
 		spin_lock_bh(&psock->ingress_lock);
Reference in New Issue
Block a user