1#![doc = include_str!("../README.md")]
2#![cfg_attr(not(test), no_std)]
3#![warn(missing_docs)]
5#![forbid(unsafe_code)]
7#![allow(clippy::unusual_byte_groupings)]
9
10mod error;
11pub use error::Utf8ParserError;
12
/// Smallest code point that needs a two-byte encoding; a multi-byte sequence that decodes to
/// less than this is reported as overlong.
const FIRST_CODE_POINT_FOR_DOUBLE: u32 = 0x80;
/// Smallest code point that needs a three-byte encoding; used as the overlong threshold for
/// three-byte sequences.
const FIRST_CODE_POINT_FOR_TRIPLE: u32 = 0x800;
/// Smallest code point that needs a four-byte encoding; used as the overlong threshold for
/// four-byte sequences.
const FIRST_CODE_POINT_FOR_QUADRUPLE: u32 = 0x10000;
16
/// Classification of a single byte according to the bit pattern in its high bits.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Utf8ByteType {
    /// A byte that continues a multi-byte sequence (`0b10xx_xxxx`).
    Continuation,
    /// A byte that encodes a character on its own (`0b0xxx_xxxx`).
    Single,
    /// The first byte of a two-byte sequence (`0b110x_xxxx`).
    Double,
    /// The first byte of a three-byte sequence (`0b1110_xxxx`).
    Triple,
    /// The first byte of a four-byte sequence (`0b1111_0xxx`).
    Quadruple,
}
46
47impl Utf8ByteType {
48 pub const fn of(byte: u8) -> Result<Self, Utf8ParserError> {
50 use Utf8ByteType::*;
51 let kinds = [Continuation, Single, Double, Triple, Quadruple];
52
53 let mut i = 0;
54 while i < kinds.len() {
55 if kinds[i].matches(byte) {
56 return Ok(kinds[i]);
57 }
58 i += 1;
59 }
60
61 Err(Utf8ParserError::InvalidByte(byte))
62 }
63
64 pub const fn is_continuation(self) -> bool {
66 matches!(self, Self::Continuation)
67 }
68
69 const fn id(self) -> u8 {
70 match self {
71 Self::Single => 0b0,
72 Self::Continuation => 0b10,
73 Self::Double => 0b110,
74 Self::Triple => 0b1110,
75 Self::Quadruple => 0b11110,
76 }
77 }
78
79 const fn id_length(self) -> u32 {
80 self.id().count_ones() + 1
81 }
82
83 const fn value_mask(self) -> u8 {
84 0xFF >> self.id_length()
85 }
86
87 const fn value_mask_length(self) -> u32 {
88 self.value_mask().count_ones()
89 }
90
91 const fn matches(self, byte: u8) -> bool {
92 (byte >> self.value_mask_length()) == self.id()
93 }
94}
95
/// A byte split into its role and its payload bits (the identifying prefix is masked off).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParsedByte {
    /// A stand-alone byte; holds its seven payload bits.
    Single(u8),
    /// Start of a two-byte sequence; holds its five payload bits.
    StartDouble(u8),
    /// Start of a three-byte sequence; holds its four payload bits.
    StartTriple(u8),
    /// Start of a four-byte sequence; holds its three payload bits.
    StartQuadruple(u8),
    /// A continuation byte; holds its six payload bits.
    ContinuationByte(u8),
}
110
111impl ParsedByte {
112 const fn from_byte(byte: u8) -> Result<Self, Utf8ParserError> {
114 use Utf8ByteType::*;
115 let kind = match Utf8ByteType::of(byte) {
116 Ok(val) => val,
117 Err(err) => {
118 return Err(err);
119 }
120 };
121 let value = byte & kind.value_mask();
122
123 Ok(match kind {
124 Continuation => Self::ContinuationByte(value),
125 Single => Self::Single(value),
126 Double => Self::StartDouble(value),
127 Triple => Self::StartTriple(value),
128 Quadruple => Self::StartQuadruple(value),
129 })
130 }
131}
132
/// Position of the parser inside the sequence it is currently decoding.
///
/// The `u32` payloads hold the code-point bits gathered so far.
#[derive(Debug, Clone, Copy)]
enum State {
    /// No sequence is in progress.
    Fresh,
    /// One more continuation byte is expected.
    OneLeft(u32),
    /// Two more continuation bytes are expected.
    TwoLeft(u32),
    /// Three more continuation bytes are expected.
    ThreeLeft(u32),
}
140
141const fn push_byte(current: u32, byte: u8) -> u32 {
142 debug_assert!(current <= 0x00FFFFFF);
143 debug_assert!(byte <= 0b0011_1111);
144 (current << Utf8ByteType::Continuation.value_mask_length()) | (byte as u32)
145}
146
/// Incremental UTF-8 decoder that is fed one byte at a time through [`Utf8Parser::push`].
#[derive(Clone, Debug)]
pub struct Utf8Parser {
    /// Progress through the sequence currently being decoded.
    state: State,
}
171
172impl Utf8Parser {
173 pub const fn new() -> Self {
175 Self {
176 state: State::Fresh,
177 }
178 }
179
180 pub const fn push(&mut self, byte: u8) -> Result<Option<char>, Utf8ParserError> {
182 match self.push_inner_impl(byte) {
183 Ok(val) => Ok(val),
184 Err(val) => {
186 self.reset();
187 Err(val)
188 }
189 }
190 }
191
192 const fn push_inner_impl(&mut self, byte: u8) -> Result<Option<char>, Utf8ParserError> {
194 let byte = match ParsedByte::from_byte(byte) {
195 Ok(v) => v,
196 Err(e) => {
197 return Err(e);
198 }
199 };
200
201 match (self.state, byte) {
202 (State::OneLeft(current), ParsedByte::ContinuationByte(value)) => {
203 self.state = State::Fresh;
204 let val = push_byte(current, value);
205 if val < FIRST_CODE_POINT_FOR_DOUBLE {
206 return Err(Utf8ParserError::OverlongEncoding);
207 }
208 match char::from_u32(val) {
209 Some(val) => Ok(Some(val)),
210 None => Err(Utf8ParserError::InvalidChar(val)),
211 }
212 }
213 (State::TwoLeft(current), ParsedByte::ContinuationByte(value)) => {
214 let val = push_byte(current, value);
215 if val << Utf8ByteType::Continuation.value_mask_length()
216 < FIRST_CODE_POINT_FOR_TRIPLE
217 {
218 return Err(Utf8ParserError::OverlongEncoding);
219 }
220 self.state = State::OneLeft(val);
221 Ok(None)
222 }
223 (State::ThreeLeft(current), ParsedByte::ContinuationByte(value)) => {
224 let val = push_byte(current, value);
225 if val << (2 * Utf8ByteType::Continuation.value_mask_length())
226 < FIRST_CODE_POINT_FOR_QUADRUPLE
227 {
228 return Err(Utf8ParserError::OverlongEncoding);
229 }
230 self.state = State::TwoLeft(val);
231 Ok(None)
232 }
233 (State::Fresh, ParsedByte::Single(value)) => Ok(Some(value as char)),
234 (State::Fresh, ParsedByte::StartDouble(value)) => {
235 self.state = State::OneLeft(value as u32);
236 Ok(None)
237 }
238 (State::Fresh, ParsedByte::StartTriple(value)) => {
239 self.state = State::TwoLeft(value as u32);
240 Ok(None)
241 }
242 (State::Fresh, ParsedByte::StartQuadruple(value)) => {
243 self.state = State::ThreeLeft(value as u32);
244 Ok(None)
245 }
246 (
247 State::OneLeft(_) | State::TwoLeft(_) | State::ThreeLeft(_),
248 ParsedByte::Single(value)
249 | ParsedByte::StartDouble(value)
250 | ParsedByte::StartTriple(value)
251 | ParsedByte::StartQuadruple(value),
252 ) => Err(Utf8ParserError::UnexpectedStartByte(value)),
253 (State::Fresh, ParsedByte::ContinuationByte(value)) => {
254 Err(Utf8ParserError::UnexpectedContinuationByte(value))
255 }
256 }
257 }
258
259 const fn reset(&mut self) {
261 self.state = State::Fresh;
262 }
263}
264
impl Default for Utf8Parser {
    /// Equivalent to [`Utf8Parser::new`].
    fn default() -> Self {
        Self::new()
    }
}
270
#[cfg(test)]
mod tests {
    use super::*;
    use rand::Rng;

    // Each raw byte must be classified correctly and have its prefix stripped.
    #[test]
    fn conversion() -> Result<(), Utf8ParserError> {
        let test_vectors = &[
            (0x00, ParsedByte::Single(0x00)),
            (0x01, ParsedByte::Single(0x01)),
            (0x65, ParsedByte::Single(0x65)),
            (0x7f, ParsedByte::Single(0x7f)),
            (0b110_00000, ParsedByte::StartDouble(0)),
            (0b110_00001, ParsedByte::StartDouble(0b1)),
            (0b110_11001, ParsedByte::StartDouble(0b11001)),
            (0b110_11111, ParsedByte::StartDouble(0b11111)),
            (0b1110_0000, ParsedByte::StartTriple(0)),
            (0b1110_0001, ParsedByte::StartTriple(0b1)),
            (0b1110_1001, ParsedByte::StartTriple(0b1001)),
            (0b1110_1111, ParsedByte::StartTriple(0b1111)),
            (0b1111_0000, ParsedByte::StartQuadruple(0)),
            (0b1111_0001, ParsedByte::StartQuadruple(0b1)),
            (0b1111_0111, ParsedByte::StartQuadruple(0b111)),
            (0x80, ParsedByte::ContinuationByte(0x00)),
            (0x81, ParsedByte::ContinuationByte(0x01)),
            (0b10_111111, ParsedByte::ContinuationByte(0b111111)),
        ];

        for tv in test_vectors.iter() {
            assert_eq!(ParsedByte::from_byte(tv.0)?, tv.1);
        }

        Ok(())
    }

    // ASCII bytes come straight back; a start byte yields nothing until the sequence completes.
    #[test]
    fn basic() -> Result<(), Utf8ParserError> {
        let mut parser = Utf8Parser::default();
        assert_eq!(parser.push(b'h')?, Some('h'));
        assert_eq!(parser.push(b'e')?, Some('e'));
        assert_eq!(parser.push(b'l')?, Some('l'));
        assert_eq!(parser.push(b'l')?, Some('l'));
        assert_eq!(parser.push(b'o')?, Some('o'));
        assert_eq!(parser.push(0b1101_0000)?, None);
        Ok(())
    }

    // Decodes `original` byte by byte and checks the result against `String::from_utf8`.
    // Returns the first parser error, if any, before that comparison is attempted.
    fn parse_str_by_bytes(original: &[u8]) -> Result<String, Utf8ParserError> {
        let mut rebuilt = String::new();

        let mut parser = Utf8Parser::default();
        for byte in original {
            if let Some(c) = parser.push(*byte)? {
                rebuilt.push(c);
            }
        }

        assert_eq!(String::from_utf8(original.into()).unwrap(), rebuilt);

        Ok(rebuilt)
    }

    #[test]
    fn parse_ascii_stream() -> Result<(), Utf8ParserError> {
        parse_str_by_bytes("The quick brown fox jamped over the lazy dog".as_bytes())?;
        Ok(())
    }

    // Mixes one-, two- and four-byte characters. The non-ASCII characters are written as
    // escapes (U+00E9 e-acute, U+1F98A fox face, U+1F415 dog) so that the test does not depend
    // on how this source file is encoded.
    #[test]
    fn parse_emoji_stream() -> Result<(), Utf8ParserError> {
        parse_str_by_bytes(
            "Th\u{e9} quick brown \u{1F98A} jamped over the lazy \u{1F415}".as_bytes(),
        )?;
        Ok(())
    }

    #[test]
    fn reset_state_after_error() {
        let mut parser = Utf8Parser::new();

        // Open a three-byte sequence...
        assert!(parser.push(0b1110_0000).is_ok());
        // ...break it with a byte that is never valid...
        assert!(parser.push(0b1111_1110).is_err());
        // ...and check that the parser starts over cleanly afterwards.
        assert_eq!(parser.push(b'a'), Ok(Some('a')));
    }

    // The whole push API must be usable from a `const fn`.
    #[test]
    const fn const_usage() {
        let mut parser = Utf8Parser::new();

        // F0 9F 90 95 is the UTF-8 encoding of U+1F415.
        assert!(matches!(parser.push(0xf0), Ok(None)));
        assert!(matches!(parser.push(0x9f), Ok(None)));
        assert!(matches!(parser.push(0x90), Ok(None)));
        assert!(matches!(parser.push(0x95), Ok(Some('\u{1F415}'))));
    }

    #[test]
    fn error_on_overlong_encodings() {
        let good: Vec<(&[u8], u32)> = vec![
            // Smallest and largest one-byte code points.
            (&[0b0_0000000], 0x00),
            (&[0b0_1111111], 0x7f),
            // Smallest and largest two-byte code points.
            (&[0b110_00010, 0b10_000000], 0x80),
            (&[0b110_11111, 0b10_111111], 0x7ff),
            // Smallest and largest three-byte code points.
            (&[0b1110_0000, 0b10_100000, 0b10_000000], 0x800),
            (&[0b1110_1111, 0b10_111111, 0b10_111111], 0xFFFF),
            // Smallest and largest four-byte code points.
            (
                &[0b11110_000, 0b10_010000, 0b10_000000, 0b10_000000],
                0x10000,
            ),
            (
                &[0b11110_100, 0b10_001111, 0b10_111111, 0b10_111111],
                0x10FFFF,
            ),
        ];
        let overlong: Vec<&[u8]> = vec![
            // Two-byte forms of code points that fit in one byte.
            &[0b110_00000, 0b10_000000],
            &[0b110_00001, 0b10_111111],
            // Three-byte forms of code points that fit in two bytes or fewer.
            &[0b1110_0000, 0b10_000000, 0b10_000000],
            &[0b1110_0000, 0b10_011111, 0b10_111111],
            // Four-byte forms of code points that fit in three bytes or fewer.
            &[0b11110_000, 0b10_000000, 0b10_000000, 0b10_000000],
            &[0b11110_000, 0b10_001111, 0b10_000000, 0b10_111111],
        ];
        let err_but_not_overlong: Vec<&[u8]> = vec![
            // Decodes to a value above U+10FFFF: invalid, but not overlong.
            &[0b11110_110, 0b10_000000, 0b10_000000, 0b10_000000],
        ];

        for tv in good {
            assert_eq!(
                parse_str_by_bytes(tv.0).unwrap().chars().next().unwrap() as u32,
                tv.1
            );
        }

        for tv in overlong {
            assert_eq!(
                parse_str_by_bytes(tv).unwrap_err(),
                Utf8ParserError::OverlongEncoding
            );
        }

        for tv in err_but_not_overlong {
            assert_ne!(
                parse_str_by_bytes(tv).unwrap_err(),
                Utf8ParserError::OverlongEncoding
            );
        }
    }

    // Arbitrary bytes may produce errors but must never panic.
    #[test]
    fn random_input_dont_panic() {
        let mut parser = Utf8Parser::default();
        let mut rng = rand::rng();
        for _ in 0..1_000_000 {
            let _ = parser.push(rng.random());
        }
    }

    // Any stream of bytes below 0x80 must decode without error.
    #[test]
    fn random_ascii_dont_error() {
        let mut parser = Utf8Parser::default();
        let mut rng = rand::rng();
        for _ in 0..1_000_000 {
            let val: u8 = rng.random();
            parser.push(val % 0x80).unwrap();
        }
    }
}