From 0766162b0f3854d6a80b4f9e77aff7796a760a8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Thu, 31 Jul 2025 14:00:56 -0400 Subject: [PATCH 1/7] sunrpc: fix client side handling of tls alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-131164 cve CVE-2025-38571 commit-author Olga Kornievskaia commit cc5d59081fa26506d02de2127ab822f40d88bc5a upstream-diff Resolved context conflicts from missing de4eda9de2d957ef2d6a8365a01e26a435e958cb A security exploit was discovered in NFS over TLS in tls_alert_recv due to its assumption that there is valid data in the msghdr's iterator's kvec. Instead, this patch proposes the rework how control messages are setup and used by sock_recvmsg(). If no control message structure is setup, kTLS layer will read and process TLS data record types. As soon as it encounters a TLS control message, it would return an error. At that point, NFS can setup a kvec backed control buffer and read in the control message such as a TLS alert. Scott found that a msg iterator can advance the kvec pointer as a part of the copy process thus we need to revert the iterator before calling into the tls_alert_recv. Fixes: dea034b963c8 ("SUNRPC: Capture CMSG metadata on client-side receive") Suggested-by: Trond Myklebust Suggested-by: Scott Mayhew Signed-off-by: Olga Kornievskaia Link: https://lore.kernel.org/r/20250731180058.4669-3-okorniev@redhat.com Signed-off-by: Trond Myklebust (cherry picked from commit cc5d59081fa26506d02de2127ab822f40d88bc5a) Signed-off-by: Marcin Wcisło --- net/sunrpc/xprtsock.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ab2d5a85fd79a..41c395f140b83 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -367,7 +367,7 @@ xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) static int xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, - struct cmsghdr *cmsg, int ret) + unsigned int *msg_flags, struct cmsghdr *cmsg, int ret) { u8 content_type = tls_get_record_type(sock->sk, cmsg); u8 level, description; @@ -380,7 +380,7 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, * record, even though there might be more frames * waiting to be decrypted. */ - msg->msg_flags &= ~MSG_EOR; + *msg_flags &= ~MSG_EOR; break; case TLS_RECORD_TYPE_ALERT: tls_alert_recv(sock->sk, msg, &level, &description); @@ -395,19 +395,33 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, } static int -xs_sock_recv_cmsg(struct socket *sock, struct msghdr *msg, int flags) +xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) { union { struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; + u8 alert[2]; + struct kvec alert_kvec = { + .iov_base = alert, + .iov_len = sizeof(alert), + }; + struct msghdr msg = { + .msg_flags = *msg_flags, + .msg_control = &u, + .msg_controllen = sizeof(u), + }; int ret; - msg->msg_control = &u; - msg->msg_controllen = sizeof(u); - ret = sock_recvmsg(sock, msg, flags); - if (msg->msg_controllen != sizeof(u)) - ret = xs_sock_process_cmsg(sock, msg, &u.cmsg, ret); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, + alert_kvec.iov_len); + ret = sock_recvmsg(sock, &msg, flags); + if (ret > 0 && + tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { + iov_iter_revert(&msg.msg_iter, ret); + ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, + -EAGAIN); + } return ret; } @@ -417,7 +431,13 @@ xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) ssize_t ret; if (seek != 0) iov_iter_advance(&msg->msg_iter, seek); - ret = xs_sock_recv_cmsg(sock, msg, flags); + ret = sock_recvmsg(sock, msg, flags); + /* Handle TLS inband control message lazily */ + if (msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR); + if (ret == 0 || ret == -EIO) + ret = xs_sock_recv_cmsg(sock, &msg->msg_flags, flags); + } return ret > 0 ? ret + seek : ret; } @@ -443,7 +463,7 @@ xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, size_t count) { iov_iter_discard(&msg->msg_iter, READ, count); - return xs_sock_recv_cmsg(sock, msg, flags); + return xs_sock_recvmsg(sock, msg, flags, 0); } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE From c6fdbf2305aba0e177d6df793b6a0abdadccc441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Thu, 4 Sep 2025 16:09:57 -0500 Subject: [PATCH 2/7] SUNRPC: call xs_sock_process_cmsg for all cmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-131164 cve-bf CVE-2025-38571 commit-author Justin Worrell commit 9559d2fffd4f9b892165eed48198a0e5cb8504e6 xs_sock_recv_cmsg was failing to call xs_sock_process_cmsg for any cmsg type other than TLS_RECORD_TYPE_ALERT (TLS_RECORD_TYPE_DATA, and other values not handled.) Based on my reading of the previous commit (cc5d5908: sunrpc: fix client side handling of tls alerts), it looks like only iov_iter_revert should be conditional on TLS_RECORD_TYPE_ALERT (but that other cmsg types should still call xs_sock_process_cmsg). On my machine, I was unable to connect (over mtls) to an NFS share hosted on FreeBSD. With this patch applied, I am able to mount the share again. Fixes: cc5d59081fa2 ("sunrpc: fix client side handling of tls alerts") Signed-off-by: Justin Worrell Reviewed-and-tested-by: Scott Mayhew Link: https://lore.kernel.org/r/20250904211038.12874-3-jworrell@gmail.com Signed-off-by: Trond Myklebust (cherry picked from commit 9559d2fffd4f9b892165eed48198a0e5cb8504e6) Signed-off-by: Marcin Wcisło --- net/sunrpc/xprtsock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 41c395f140b83..02593b1da0224 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -416,9 +416,9 @@ xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, alert_kvec.iov_len); ret = sock_recvmsg(sock, &msg, flags); - if (ret > 0 && - tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { - iov_iter_revert(&msg.msg_iter, ret); + if (ret > 0) { + if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) + iov_iter_revert(&msg.msg_iter, ret); ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, -EAGAIN); } From 88c19b69cac57fe952776665853a2e4ab9bb6f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Wed, 21 Jan 2026 00:04:09 +0100 Subject: [PATCH 3/7] tls: stop recv() if initial process_rx_list gave us non-DATA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-131361 cve CVE-2024-58239 commit-author Sabrina Dubroca commit fdfbaec5923d9359698cbb286bc0deadbb717504 If we have a non-DATA record on the rx_list and another record of the same type still on the queue, we will end up merging them: - process_rx_list copies the non-DATA record - we start the loop and process the first available record since it's of the same type - we break out of the loop since the record was not DATA Just check the record type and jump to the end in case process_rx_list did some work. Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/bd31449e43bd4b6ff546f5c51cf958c31c511deb.1708007371.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski (cherry picked from commit fdfbaec5923d9359698cbb286bc0deadbb717504) Signed-off-by: Marcin Wcisło --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 62802de8cd86c..02a59e20b6e87 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2013,7 +2013,7 @@ int tls_sw_recvmsg(struct sock *sk, goto end; copied = err; - if (len <= copied) + if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA)) goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); From 7a0b5f9a2da945f30283afd9e91b569d86d86af4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Tue, 20 Jan 2026 23:59:05 +0100 Subject: [PATCH 4/7] net: tls: fix returned read length with async decrypt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-136508 cve-pre CVE-2025-39682 commit-author Jakub Kicinski commit ac437a51ce662364062f704e321227f6728e6adc We double count async, non-zc rx data. The previous fix was lucky because if we fully zc async_copy_bytes is 0 so we add 0. Decrypted already has all the bytes we handled, in all cases. We don't have to adjust anything, delete the erroneous line. Fixes: 4d42cd6bc2ac ("tls: rx: fix return value for async crypto") Co-developed-by: Sabrina Dubroca Signed-off-by: Sabrina Dubroca Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller (cherry picked from commit ac437a51ce662364062f704e321227f6728e6adc) Signed-off-by: Marcin Wcisło --- net/tls/tls_sw.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 02a59e20b6e87..697dfc9d52713 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2174,7 +2174,6 @@ int tls_sw_recvmsg(struct sock *sk, else err = process_rx_list(ctx, msg, &control, 0, async_copy_bytes, is_peek); - decrypted += max(err, 0); } copied += decrypted; From f6252d72f68b2748d813e54c9ef69dabbe3606cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Wed, 21 Jan 2026 00:04:28 +0100 Subject: [PATCH 5/7] tls: don't skip over different type records from the rx_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-136508 cve-pre CVE-2025-39682 commit-author Sabrina Dubroca commit ec823bf3a479d42c589dc0f28ef4951c49cd2d2a If we queue 3 records: - record 1, type DATA - record 2, some other type - record 3, type DATA and do a recv(PEEK), the rx_list will contain the first two records. The next large recv will walk through the rx_list and copy data from record 1, then stop because record 2 is a different type. Since we haven't filled up our buffer, we will process the next available record. It's also DATA, so we can merge it with the current read. We shouldn't do that, since there was a record in between that we ignored. Add a flag to let process_rx_list inform tls_sw_recvmsg that it had more data available. Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Signed-off-by: Sabrina Dubroca Link: https://lore.kernel.org/r/f00c0c0afa080c60f016df1471158c1caf983c34.1708007371.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski (cherry picked from commit ec823bf3a479d42c589dc0f28ef4951c49cd2d2a) Signed-off-by: Marcin Wcisło --- net/tls/tls_sw.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 697dfc9d52713..f6581a8567fd6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1811,7 +1811,8 @@ static int process_rx_list(struct tls_sw_context_rx *ctx, u8 *control, size_t skip, size_t len, - bool is_peek) + bool is_peek, + bool *more) { struct sk_buff *skb = skb_peek(&ctx->rx_list); struct tls_msg *tlm; @@ -1824,7 +1825,7 @@ static int process_rx_list(struct tls_sw_context_rx *ctx, err = tls_record_content_type(msg, tlm, control); if (err <= 0) - goto out; + goto more; if (skip < rxm->full_len) break; @@ -1842,12 +1843,12 @@ static int process_rx_list(struct tls_sw_context_rx *ctx, err = tls_record_content_type(msg, tlm, control); if (err <= 0) - goto out; + goto more; err = skb_copy_datagram_msg(skb, rxm->offset + skip, msg, chunk); if (err < 0) - goto out; + goto more; len = len - chunk; copied = copied + chunk; @@ -1883,6 +1884,10 @@ static int process_rx_list(struct tls_sw_context_rx *ctx, out: return copied ? : err; +more: + if (more) + *more = true; + goto out; } static bool @@ -1987,6 +1992,7 @@ int tls_sw_recvmsg(struct sock *sk, int target, err; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; + bool rx_more = false; bool released = true; bool bpf_strp_enabled; bool zc_capable; @@ -2008,12 +2014,12 @@ int tls_sw_recvmsg(struct sock *sk, goto end; /* Process pending decrypted records. It must be non-zero-copy */ - err = process_rx_list(ctx, msg, &control, 0, len, is_peek); + err = process_rx_list(ctx, msg, &control, 0, len, is_peek, &rx_more); if (err < 0) goto end; copied = err; - if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA)) + if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA) || rx_more) goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); @@ -2170,10 +2176,10 @@ int tls_sw_recvmsg(struct sock *sk, /* Drain records from the rx_list & copy if required */ if (is_peek || is_kvec) err = process_rx_list(ctx, msg, &control, copied, - decrypted, is_peek); + decrypted, is_peek, NULL); else err = process_rx_list(ctx, msg, &control, 0, - async_copy_bytes, is_peek); + async_copy_bytes, is_peek, NULL); } copied += decrypted; From 1eda239fd2f21c8f2e3e7b89e4d574c6b11bc793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Wed, 21 Jan 2026 00:04:40 +0100 Subject: [PATCH 6/7] tls: adjust recv return with async crypto and failed copy to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-136508 cve-pre CVE-2025-39682 commit-author Sabrina Dubroca commit 85eef9a41d019b59be7bc91793f26251909c0710 process_rx_list may not copy as many bytes as we want to the userspace buffer, for example in case we hit an EFAULT during the copy. If this happens, we should only count the bytes that were actually copied, which may be 0. Subtracting async_copy_bytes is correct in both peek and !peek cases, because decrypted == async_copy_bytes + peeked for the peek case: peek is always !ZC, and we can go through either the sync or async path. In the async case, we add chunk to both decrypted and async_copy_bytes. In the sync case, we add chunk to both decrypted and peeked. I missed that in commit 6caaf104423d ("tls: fix peeking with sync+async decryption"). Fixes: 4d42cd6bc2ac ("tls: rx: fix return value for async crypto") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/1b5a1eaab3c088a9dd5d9f1059ceecd7afe888d1.1711120964.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski (cherry picked from commit 85eef9a41d019b59be7bc91793f26251909c0710) Signed-off-by: Marcin Wcisło --- net/tls/tls_sw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f6581a8567fd6..fbf8fb4f65adb 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2180,6 +2180,9 @@ int tls_sw_recvmsg(struct sock *sk, else err = process_rx_list(ctx, msg, &control, 0, async_copy_bytes, is_peek, NULL); + + /* we could have copied less than we wanted, and possibly nothing */ + decrypted += max(err, 0) - async_copy_bytes; } copied += decrypted; From 4ff21a190be80effd32b34b81f7685f7e1f1cfcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Wcis=C5=82o?= Date: Wed, 21 Jan 2026 00:04:56 +0100 Subject: [PATCH 7/7] tls: fix handling of zero-length records on the rx_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jira VULN-136508 cve CVE-2025-39682 commit-author Jakub Kicinski commit 62708b9452f8eb77513115b17c4f8d1a22ebf843 Each recvmsg() call must process either - only contiguous DATA records (any number of them) - one non-DATA record If the next record has different type than what has already been processed we break out of the main processing loop. If the record has already been decrypted (which may be the case for TLS 1.3 where we don't know type until decryption) we queue the pending record to the rx_list. Next recvmsg() will pick it up from there. Queuing the skb to rx_list after zero-copy decrypt is not possible, since in that case we decrypted directly to the user space buffer, and we don't have an skb to queue (darg.skb points to the ciphertext skb for access to metadata like length). Only data records are allowed zero-copy, and we break the processing loop after each non-data record. So we should never zero-copy and then find out that the record type has changed. The corner case we missed is when the initial record comes from rx_list, and it's zero length. Reported-by: Muhammad Alifa Ramdhan Reported-by: Billy Jheng Bing-Jhong Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250820021952.143068-1-kuba@kernel.org Signed-off-by: Jakub Kicinski (cherry picked from commit 62708b9452f8eb77513115b17c4f8d1a22ebf843) Signed-off-by: Marcin Wcisło --- net/tls/tls_sw.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index fbf8fb4f65adb..3ddce41ea0ea8 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1773,6 +1773,9 @@ int decrypt_skb(struct sock *sk, struct scatterlist *sgout) return tls_decrypt_sg(sk, NULL, sgout, &darg); } +/* All records returned from a recvmsg() call must have the same type. + * 0 is not a valid content type. Use it as "no type reported, yet". + */ static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm, u8 *control) { @@ -2018,8 +2021,10 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) goto end; + /* process_rx_list() will set @control if it processed any records */ copied = err; - if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA) || rx_more) + if (len <= copied || rx_more || + (control && control != TLS_RECORD_TYPE_DATA)) goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);