zerodds_websocket_bridge/
utf8.rs1use core::result::Result;
15
/// Reasons a byte sequence is rejected as UTF-8 by this module.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Utf8Error {
    /// The input ends before a multi-byte sequence is complete.
    Truncated,
    /// A byte in `0x80..=0xBF` appears where a lead byte is expected.
    UnexpectedContinuation,
    /// The lead byte is in `0xF5..=0xFF`, which never starts a sequence.
    InvalidLeadByte,
    /// A three-byte sequence decodes to a value in `U+D800..=U+DFFF`.
    SurrogateCodepoint,
    /// The sequence uses more bytes than its code point requires
    /// (including the lead bytes `0xC0` and `0xC1`).
    OverlongEncoding,
    /// A four-byte sequence decodes to a value above `U+10FFFF`.
    CodepointOutOfRange,
    /// A byte following a lead byte does not have the form `10xxxxxx`.
    InvalidContinuation,
}
34
35impl core::fmt::Display for Utf8Error {
36 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
37 match self {
38 Self::Truncated => write!(f, "Truncated"),
39 Self::UnexpectedContinuation => write!(f, "UnexpectedContinuation"),
40 Self::InvalidLeadByte => write!(f, "InvalidLeadByte"),
41 Self::SurrogateCodepoint => write!(f, "SurrogateCodepoint"),
42 Self::OverlongEncoding => write!(f, "OverlongEncoding"),
43 Self::CodepointOutOfRange => write!(f, "CodepointOutOfRange"),
44 Self::InvalidContinuation => write!(f, "InvalidContinuation"),
45 }
46 }
47}
48
/// Lets `Utf8Error` be used as a standard error type; only compiled when the
/// `std` feature is enabled.
#[cfg(feature = "std")]
impl std::error::Error for Utf8Error {}
51
52pub fn validate(bytes: &[u8]) -> Result<(), Utf8Error> {
60 let mut i = 0;
61 while i < bytes.len() {
62 let b0 = bytes[i];
63 let needed = match b0 {
64 0x00..=0x7F => 0, 0xC0..=0xC1 => return Err(Utf8Error::OverlongEncoding),
66 0xC2..=0xDF => 1, 0xE0..=0xEF => 2, 0xF0..=0xF4 => 3, 0xF5..=0xFF => return Err(Utf8Error::InvalidLeadByte),
70 0x80..=0xBF => return Err(Utf8Error::UnexpectedContinuation),
71 };
72
73 if needed == 0 {
74 i += 1;
75 continue;
76 }
77
78 if i + needed >= bytes.len() {
79 return Err(Utf8Error::Truncated);
80 }
81
82 for k in 1..=needed {
84 if (bytes[i + k] & 0b1100_0000) != 0b1000_0000 {
85 return Err(Utf8Error::InvalidContinuation);
86 }
87 }
88
89 let cp = match needed {
91 1 => {
92 let cp = (u32::from(b0 & 0b0001_1111) << 6) | u32::from(bytes[i + 1] & 0b0011_1111);
93 if cp < 0x80 {
94 return Err(Utf8Error::OverlongEncoding);
95 }
96 cp
97 }
98 2 => {
99 let cp = (u32::from(b0 & 0b0000_1111) << 12)
100 | (u32::from(bytes[i + 1] & 0b0011_1111) << 6)
101 | u32::from(bytes[i + 2] & 0b0011_1111);
102 if cp < 0x800 {
103 return Err(Utf8Error::OverlongEncoding);
104 }
105 if (0xD800..=0xDFFF).contains(&cp) {
106 return Err(Utf8Error::SurrogateCodepoint);
107 }
108 cp
109 }
110 3 => {
111 let cp = (u32::from(b0 & 0b0000_0111) << 18)
112 | (u32::from(bytes[i + 1] & 0b0011_1111) << 12)
113 | (u32::from(bytes[i + 2] & 0b0011_1111) << 6)
114 | u32::from(bytes[i + 3] & 0b0011_1111);
115 if cp < 0x1_0000 {
116 return Err(Utf8Error::OverlongEncoding);
117 }
118 if cp > 0x10_FFFF {
119 return Err(Utf8Error::CodepointOutOfRange);
120 }
121 cp
122 }
123 _ => return Err(Utf8Error::InvalidLeadByte),
124 };
125 let _ = cp;
126 i += 1 + needed;
127 }
128 Ok(())
129}
130
/// Incremental UTF-8 validator for input that arrives in chunks.
///
/// A multi-byte sequence that is cut off at the end of one chunk is kept
/// here until a later chunk supplies the remaining bytes.
#[derive(Default, Debug)]
pub struct StreamingValidator {
    /// Storage for the bytes of an incomplete trailing sequence.
    pending: [u8; 4],
    /// Number of leading bytes of `pending` that are currently in use.
    pending_len: usize,
    /// Number of bytes still missing to complete the buffered sequence.
    needed: usize,
}
143
144impl StreamingValidator {
145 #[must_use]
147 pub fn new() -> Self {
148 Self::default()
149 }
150
151 pub fn feed(&mut self, chunk: &[u8]) -> Result<(), Utf8Error> {
157 let mut buf: alloc::vec::Vec<u8> = alloc::vec::Vec::new();
158 buf.extend_from_slice(&self.pending[..self.pending_len]);
159 buf.extend_from_slice(chunk);
160 self.pending_len = 0;
161 self.needed = 0;
162
163 let mut i = 0;
164 while i < buf.len() {
165 let b0 = buf[i];
166 let needed = match b0 {
167 0x00..=0x7F => 0,
168 0xC0..=0xC1 => return Err(Utf8Error::OverlongEncoding),
169 0xC2..=0xDF => 1,
170 0xE0..=0xEF => 2,
171 0xF0..=0xF4 => 3,
172 0xF5..=0xFF => return Err(Utf8Error::InvalidLeadByte),
173 0x80..=0xBF => return Err(Utf8Error::UnexpectedContinuation),
174 };
175
176 if needed == 0 {
177 i += 1;
178 continue;
179 }
180
181 if i + needed >= buf.len() {
182 let remaining = buf.len() - i;
184 self.pending_len = remaining;
185 self.pending[..remaining].copy_from_slice(&buf[i..]);
186 self.needed = needed - (remaining - 1);
187 return Ok(());
188 }
189
190 validate(&buf[i..i + 1 + needed])?;
192 i += 1 + needed;
193 }
194
195 Ok(())
196 }
197
198 pub fn finalize(self) -> Result<(), Utf8Error> {
205 if self.pending_len == 0 {
206 Ok(())
207 } else {
208 Err(Utf8Error::Truncated)
209 }
210 }
211}
212
/// Unit tests for the one-shot and the streaming validator.
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty slice contains no malformed sequence.
    #[test]
    fn empty_is_valid() {
        assert_eq!(validate(b""), Ok(()));
    }

    /// Pure ASCII passes.
    #[test]
    fn ascii_is_valid() {
        assert_eq!(validate(b"hello world"), Ok(()));
    }

    /// `C3 A9` encodes U+00E9.
    #[test]
    fn valid_2_byte_codepoint() {
        let encoded = [0xC3, 0xA9];
        assert_eq!(validate(&encoded), Ok(()));
    }

    /// `E2 82 AC` encodes U+20AC.
    #[test]
    fn valid_3_byte_codepoint() {
        let encoded = [0xE2, 0x82, 0xAC];
        assert_eq!(validate(&encoded), Ok(()));
    }

    /// `F0 9F 98 80` encodes U+1F600.
    #[test]
    fn valid_4_byte_codepoint() {
        let encoded = [0xF0, 0x9F, 0x98, 0x80];
        assert_eq!(validate(&encoded), Ok(()));
    }

    /// Lead byte `C0` can only produce an overlong form.
    #[test]
    fn rejects_overlong_2_byte_for_ascii() {
        let outcome = validate(&[0xC0, 0x80]);
        assert_eq!(outcome, Err(Utf8Error::OverlongEncoding));
    }

    /// A continuation byte cannot start a sequence.
    #[test]
    fn rejects_unexpected_continuation_byte() {
        let outcome = validate(&[0x80]);
        assert_eq!(outcome, Err(Utf8Error::UnexpectedContinuation));
    }

    /// `FF` is never a valid lead byte.
    #[test]
    fn rejects_invalid_lead_byte() {
        let outcome = validate(&[0xFF]);
        assert_eq!(outcome, Err(Utf8Error::InvalidLeadByte));
    }

    /// A two-byte lead with nothing after it is truncated.
    #[test]
    fn rejects_truncated_2_byte() {
        let outcome = validate(&[0xC3]);
        assert_eq!(outcome, Err(Utf8Error::Truncated));
    }

    /// A three-byte sequence missing its last byte is truncated.
    #[test]
    fn rejects_truncated_3_byte() {
        let outcome = validate(&[0xE2, 0x82]);
        assert_eq!(outcome, Err(Utf8Error::Truncated));
    }

    /// `00` after a lead byte is not of the form `10xxxxxx`.
    #[test]
    fn rejects_invalid_continuation() {
        let outcome = validate(&[0xC3, 0x00]);
        assert_eq!(outcome, Err(Utf8Error::InvalidContinuation));
    }

    /// `ED A0 80` decodes to U+D800, the first surrogate.
    #[test]
    fn rejects_surrogate_codepoint() {
        let outcome = validate(&[0xED, 0xA0, 0x80]);
        assert_eq!(outcome, Err(Utf8Error::SurrogateCodepoint));
    }

    /// `F4 90 80 80` decodes to 0x110000, one past U+10FFFF.
    #[test]
    fn rejects_codepoint_above_max() {
        let outcome = validate(&[0xF4, 0x90, 0x80, 0x80]);
        assert_eq!(outcome, Err(Utf8Error::CodepointOutOfRange));
    }

    /// A code point split across two chunks is accepted.
    #[test]
    fn streaming_handles_split_codepoint() {
        let mut validator = StreamingValidator::new();
        assert_eq!(validator.feed(&[0xE2]), Ok(()));
        assert_eq!(validator.feed(&[0x82, 0xAC]), Ok(()));
        assert_eq!(validator.finalize(), Ok(()));
    }

    /// Ending the stream inside a sequence reports truncation.
    #[test]
    fn streaming_finalize_with_pending_is_truncated() {
        let mut validator = StreamingValidator::new();
        assert_eq!(validator.feed(&[0xE2, 0x82]), Ok(()));
        assert_eq!(validator.finalize(), Err(Utf8Error::Truncated));
    }

    /// A chunk holding only complete code points leaves nothing pending.
    #[test]
    fn streaming_complete_codepoint_in_one_chunk() {
        let mut validator = StreamingValidator::new();
        assert_eq!(validator.feed(b"hello"), Ok(()));
        assert_eq!(validator.finalize(), Ok(()));
    }
}