Skip to main content

zerodds_websocket_bridge/
utf8.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2026 ZeroDDS Contributors
3
4//! UTF-8 Validation nach RFC 6455 §8.1 + §8.2.
5//!
6//! Spec: WebSocket-Server MUST close den Connect mit Status 1007 wenn
7//! ein Text-Frame UTF-8-invalid ist. Symmetrisch fuer den Client.
8//!
9//! Wir validieren strict (RFC 3629), inkl.:
10//! - Surrogate-Pair-Codepoints (U+D800..=U+DFFF) sind verboten.
11//! - Overlong-Encoding ist verboten.
12//! - Codepoints > U+10FFFF sind verboten.
13
14use core::result::Result;
15
/// Reasons a byte sequence can fail UTF-8 validation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Utf8Error {
    /// Truncated multi-byte sequence (a continuation byte is missing at the end).
    Truncated,
    /// Continuation byte without a preceding lead byte (an isolated `10xxxxxx`).
    UnexpectedContinuation,
    /// Lead byte is invalid (`0xF5`-`0xFF`).
    ///
    /// Note: `validate` in this file reports the lead bytes `0xC0` and `0xC1`
    /// as [`Utf8Error::OverlongEncoding`], not as this variant.
    InvalidLeadByte,
    /// Surrogate code point U+D800..=U+DFFF.
    SurrogateCodepoint,
    /// Overlong encoding (e.g. `0xC0 0x80` for NUL).
    OverlongEncoding,
    /// Code point > U+10FFFF.
    CodepointOutOfRange,
    /// A byte that should be a continuation byte is not of the form `10xxxxxx`.
    InvalidContinuation,
}
34
35impl core::fmt::Display for Utf8Error {
36    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
37        match self {
38            Self::Truncated => write!(f, "Truncated"),
39            Self::UnexpectedContinuation => write!(f, "UnexpectedContinuation"),
40            Self::InvalidLeadByte => write!(f, "InvalidLeadByte"),
41            Self::SurrogateCodepoint => write!(f, "SurrogateCodepoint"),
42            Self::OverlongEncoding => write!(f, "OverlongEncoding"),
43            Self::CodepointOutOfRange => write!(f, "CodepointOutOfRange"),
44            Self::InvalidContinuation => write!(f, "InvalidContinuation"),
45        }
46    }
47}
48
// Only compiled when the `std` feature is enabled. The impl body is empty,
// so all of the trait's provided methods keep their default behaviour.
#[cfg(feature = "std")]
impl std::error::Error for Utf8Error {}
51
52/// Validiert eine komplette UTF-8-Byte-Sequenz nach RFC 3629.
53///
54/// Liefert `Ok(())` wenn alle Bytes ein gueltiges UTF-8-Stream
55/// sind, sonst die erste gefundene Verletzung.
56///
57/// # Errors
58/// Siehe [`Utf8Error`].
59pub fn validate(bytes: &[u8]) -> Result<(), Utf8Error> {
60    let mut i = 0;
61    while i < bytes.len() {
62        let b0 = bytes[i];
63        let needed = match b0 {
64            0x00..=0x7F => 0, // ASCII
65            0xC0..=0xC1 => return Err(Utf8Error::OverlongEncoding),
66            0xC2..=0xDF => 1, // 2-byte
67            0xE0..=0xEF => 2, // 3-byte
68            0xF0..=0xF4 => 3, // 4-byte
69            0xF5..=0xFF => return Err(Utf8Error::InvalidLeadByte),
70            0x80..=0xBF => return Err(Utf8Error::UnexpectedContinuation),
71        };
72
73        if needed == 0 {
74            i += 1;
75            continue;
76        }
77
78        if i + needed >= bytes.len() {
79            return Err(Utf8Error::Truncated);
80        }
81
82        // Validate continuation bytes are 10xxxxxx
83        for k in 1..=needed {
84            if (bytes[i + k] & 0b1100_0000) != 0b1000_0000 {
85                return Err(Utf8Error::InvalidContinuation);
86            }
87        }
88
89        // Compute codepoint and check overlong + surrogate + range.
90        let cp = match needed {
91            1 => {
92                let cp = (u32::from(b0 & 0b0001_1111) << 6) | u32::from(bytes[i + 1] & 0b0011_1111);
93                if cp < 0x80 {
94                    return Err(Utf8Error::OverlongEncoding);
95                }
96                cp
97            }
98            2 => {
99                let cp = (u32::from(b0 & 0b0000_1111) << 12)
100                    | (u32::from(bytes[i + 1] & 0b0011_1111) << 6)
101                    | u32::from(bytes[i + 2] & 0b0011_1111);
102                if cp < 0x800 {
103                    return Err(Utf8Error::OverlongEncoding);
104                }
105                if (0xD800..=0xDFFF).contains(&cp) {
106                    return Err(Utf8Error::SurrogateCodepoint);
107                }
108                cp
109            }
110            3 => {
111                let cp = (u32::from(b0 & 0b0000_0111) << 18)
112                    | (u32::from(bytes[i + 1] & 0b0011_1111) << 12)
113                    | (u32::from(bytes[i + 2] & 0b0011_1111) << 6)
114                    | u32::from(bytes[i + 3] & 0b0011_1111);
115                if cp < 0x1_0000 {
116                    return Err(Utf8Error::OverlongEncoding);
117                }
118                if cp > 0x10_FFFF {
119                    return Err(Utf8Error::CodepointOutOfRange);
120                }
121                cp
122            }
123            _ => return Err(Utf8Error::InvalidLeadByte),
124        };
125        let _ = cp;
126        i += 1 + needed;
127    }
128    Ok(())
129}
130
/// Streaming UTF-8 validator for fragmented text frames.
///
/// A WebSocket text message may be split across several data frames
/// (FIN=0), and the UTF-8 byte stream may be cut in the middle of a
/// multi-byte sequence. This validator keeps a small internal buffer for
/// the code point that has been started but not yet completed.
// NOTE(review): the original comment cites "WebSocket-Spec §6.2" for
// fragmentation — confirm the section number against RFC 6455.
#[derive(Debug, Default)]
pub struct StreamingValidator {
    /// Bytes of the multi-byte sequence that was cut off at the end of the
    /// previous chunk; only the first `pending_len` entries are meaningful.
    pending: [u8; 4],
    /// Number of valid bytes in `pending` (0 when nothing is carried over).
    pending_len: usize,
    /// Number of continuation bytes still missing to complete `pending`.
    needed: usize,
}
143
144impl StreamingValidator {
145    /// Konstruktor.
146    #[must_use]
147    pub fn new() -> Self {
148        Self::default()
149    }
150
151    /// Ein Chunk Bytes pruefen. Liefert `Ok(())` wenn (zusammen mit
152    /// vorigem State) alles bisher valid ist.
153    ///
154    /// # Errors
155    /// Siehe [`Utf8Error`].
156    pub fn feed(&mut self, chunk: &[u8]) -> Result<(), Utf8Error> {
157        let mut buf: alloc::vec::Vec<u8> = alloc::vec::Vec::new();
158        buf.extend_from_slice(&self.pending[..self.pending_len]);
159        buf.extend_from_slice(chunk);
160        self.pending_len = 0;
161        self.needed = 0;
162
163        let mut i = 0;
164        while i < buf.len() {
165            let b0 = buf[i];
166            let needed = match b0 {
167                0x00..=0x7F => 0,
168                0xC0..=0xC1 => return Err(Utf8Error::OverlongEncoding),
169                0xC2..=0xDF => 1,
170                0xE0..=0xEF => 2,
171                0xF0..=0xF4 => 3,
172                0xF5..=0xFF => return Err(Utf8Error::InvalidLeadByte),
173                0x80..=0xBF => return Err(Utf8Error::UnexpectedContinuation),
174            };
175
176            if needed == 0 {
177                i += 1;
178                continue;
179            }
180
181            if i + needed >= buf.len() {
182                // Stash partial codepoint for next chunk.
183                let remaining = buf.len() - i;
184                self.pending_len = remaining;
185                self.pending[..remaining].copy_from_slice(&buf[i..]);
186                self.needed = needed - (remaining - 1);
187                return Ok(());
188            }
189
190            // Have full codepoint — validate it.
191            validate(&buf[i..i + 1 + needed])?;
192            i += 1 + needed;
193        }
194
195        Ok(())
196    }
197
198    /// `true` wenn der Stream auf Codepoint-Grenze endet (kein
199    /// pending byte). Ein Text-Frame mit FIN=1 MUSS diesen Zustand
200    /// erreichen — sonst Truncated-Fehler.
201    ///
202    /// # Errors
203    /// `Utf8Error::Truncated` wenn pending Bytes uebrig sind.
204    pub fn finalize(self) -> Result<(), Utf8Error> {
205        if self.pending_len == 0 {
206            Ok(())
207        } else {
208            Err(Utf8Error::Truncated)
209        }
210    }
211}
212
213// ---------------------------------------------------------------------------
214// Tests
215// ---------------------------------------------------------------------------
216
#[cfg(test)]
mod tests {
    use super::*;

    // One-shot `validate`: inputs that must be accepted.

    #[test]
    fn empty_is_valid() {
        let nothing: &[u8] = &[];
        assert_eq!(validate(nothing), Ok(()));
    }

    #[test]
    fn ascii_is_valid() {
        assert_eq!(validate("hello world".as_bytes()), Ok(()));
    }

    #[test]
    fn valid_2_byte_codepoint() {
        // U+00E9 (é) encodes as C3 A9.
        let e_acute = [0xC3, 0xA9];
        assert_eq!(validate(&e_acute), Ok(()));
    }

    #[test]
    fn valid_3_byte_codepoint() {
        // U+20AC (€) encodes as E2 82 AC.
        let euro = [0xE2, 0x82, 0xAC];
        assert_eq!(validate(&euro), Ok(()));
    }

    #[test]
    fn valid_4_byte_codepoint() {
        // U+1F600 (😀) encodes as F0 9F 98 80.
        let grinning_face = [0xF0, 0x9F, 0x98, 0x80];
        assert_eq!(validate(&grinning_face), Ok(()));
    }

    // One-shot `validate`: inputs that must be rejected, with the exact
    // error kind.

    #[test]
    fn rejects_overlong_2_byte_for_ascii() {
        // C0 80 would be a two-byte (overlong) encoding of U+0000.
        let outcome = validate(&[0xC0, 0x80]);
        assert_eq!(outcome, Err(Utf8Error::OverlongEncoding));
    }

    #[test]
    fn rejects_unexpected_continuation_byte() {
        let outcome = validate(&[0x80]);
        assert_eq!(outcome, Err(Utf8Error::UnexpectedContinuation));
    }

    #[test]
    fn rejects_invalid_lead_byte() {
        let outcome = validate(&[0xFF]);
        assert_eq!(outcome, Err(Utf8Error::InvalidLeadByte));
    }

    #[test]
    fn rejects_truncated_2_byte() {
        // Lead byte C3 with no continuation byte after it.
        let outcome = validate(&[0xC3]);
        assert_eq!(outcome, Err(Utf8Error::Truncated));
    }

    #[test]
    fn rejects_truncated_3_byte() {
        let outcome = validate(&[0xE2, 0x82]);
        assert_eq!(outcome, Err(Utf8Error::Truncated));
    }

    #[test]
    fn rejects_invalid_continuation() {
        // The byte after C3 is 00, which is not of the form 10xxxxxx.
        let outcome = validate(&[0xC3, 0x00]);
        assert_eq!(outcome, Err(Utf8Error::InvalidContinuation));
    }

    #[test]
    fn rejects_surrogate_codepoint() {
        // ED A0 80 decodes to U+D800, the first surrogate.
        let outcome = validate(&[0xED, 0xA0, 0x80]);
        assert_eq!(outcome, Err(Utf8Error::SurrogateCodepoint));
    }

    #[test]
    fn rejects_codepoint_above_max() {
        // F4 90 80 80 decodes to U+110000, one past the last code point.
        let outcome = validate(&[0xF4, 0x90, 0x80, 0x80]);
        assert_eq!(outcome, Err(Utf8Error::CodepointOutOfRange));
    }

    // `StreamingValidator`: chunked input.

    #[test]
    fn streaming_handles_split_codepoint() {
        // U+20AC split as E2 | 82 AC across two chunks.
        let mut validator = StreamingValidator::new();
        assert_eq!(validator.feed(&[0xE2]), Ok(()));
        assert_eq!(validator.feed(&[0x82, 0xAC]), Ok(()));
        assert_eq!(validator.finalize(), Ok(()));
    }

    #[test]
    fn streaming_finalize_with_pending_is_truncated() {
        // The third byte of the sequence never arrives.
        let mut validator = StreamingValidator::new();
        assert_eq!(validator.feed(&[0xE2, 0x82]), Ok(()));
        assert_eq!(validator.finalize(), Err(Utf8Error::Truncated));
    }

    #[test]
    fn streaming_complete_codepoint_in_one_chunk() {
        let mut validator = StreamingValidator::new();
        assert_eq!(validator.feed("hello".as_bytes()), Ok(()));
        assert_eq!(validator.finalize(), Ok(()));
    }
}