utf8_parser/
lib.rs

1#![doc = include_str!("../README.md")]
2#![cfg_attr(not(test), no_std)]
3// Make sure our crate is documented
4#![warn(missing_docs)]
5// Makes us green in cargo-geiger
6#![forbid(unsafe_code)]
7// Allowing arbitrary bit groupings makes readability easier in this context.
8#![allow(clippy::unusual_byte_groupings)]
9
10mod error;
11pub use error::Utf8ParserError;
12
// Lower bounds used for overlong-encoding detection: a decoded value below the
// bound for its sequence length could have been written with fewer bytes.

// Smallest code point that needs a two-byte sequence
const FIRST_CODE_POINT_FOR_DOUBLE: u32 = 0x0080;
// Smallest code point that needs a three-byte sequence
const FIRST_CODE_POINT_FOR_TRIPLE: u32 = 0x0800;
// Smallest code point that needs a four-byte sequence
const FIRST_CODE_POINT_FOR_QUADRUPLE: u32 = 0x1_0000;
16
/// The role a single valid byte plays inside a UTF-8 stream
///
/// A byte is either a complete one-byte character, the first byte of a
/// multi-byte sequence, or a continuation byte of such a sequence. Use
/// [`Utf8ByteType::of`] to classify a byte.
///
/// # Example
/// ```
/// # fn main() -> Result<(), utf8_parser::Utf8ParserError> {
/// use utf8_parser::Utf8ByteType;
///
/// assert_eq!(Utf8ByteType::of(0b00000010)?, Utf8ByteType::Single);
/// assert_eq!(Utf8ByteType::of(0b10000010)?, Utf8ByteType::Continuation);
/// assert_eq!(Utf8ByteType::of(0b11000010)?, Utf8ByteType::Double);
/// assert_eq!(Utf8ByteType::of(0b11100010)?, Utf8ByteType::Triple);
/// assert_eq!(Utf8ByteType::of(0b11110010)?, Utf8ByteType::Quadruple);
/// assert!(Utf8ByteType::of(0b11111010).is_err());
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Utf8ByteType {
    /// A byte that continues a multi-byte sequence (`10xxxxxx`)
    Continuation,
    /// A complete one-byte character, i.e. an ASCII value (`0xxxxxxx`)
    Single,
    /// The first byte of a two-byte sequence; one continuation byte must follow (`110xxxxx`)
    Double,
    /// The first byte of a three-byte sequence; two continuation bytes must follow (`1110xxxx`)
    Triple,
    /// The first byte of a four-byte sequence; three continuation bytes must follow (`11110xxx`)
    Quadruple,
}
46
47impl Utf8ByteType {
48    /// Get type of byte
49    pub const fn of(byte: u8) -> Result<Self, Utf8ParserError> {
50        use Utf8ByteType::*;
51        let kinds = [Continuation, Single, Double, Triple, Quadruple];
52
53        let mut i = 0;
54        while i < kinds.len() {
55            if kinds[i].matches(byte) {
56                return Ok(kinds[i]);
57            }
58            i += 1;
59        }
60
61        Err(Utf8ParserError::InvalidByte(byte))
62    }
63
64    /// Returns true if this is a continuation byte
65    pub const fn is_continuation(self) -> bool {
66        matches!(self, Self::Continuation)
67    }
68
69    const fn id(self) -> u8 {
70        match self {
71            Self::Single => 0b0,
72            Self::Continuation => 0b10,
73            Self::Double => 0b110,
74            Self::Triple => 0b1110,
75            Self::Quadruple => 0b11110,
76        }
77    }
78
79    const fn id_length(self) -> u32 {
80        self.id().count_ones() + 1
81    }
82
83    const fn value_mask(self) -> u8 {
84        0xFF >> self.id_length()
85    }
86
87    const fn value_mask_length(self) -> u32 {
88        self.value_mask().count_ones()
89    }
90
91    const fn matches(self, byte: u8) -> bool {
92        (byte >> self.value_mask_length()) == self.id()
93    }
94}
95
// One byte of a UTF-8 stream, tagged with its role. `ParsedByte::from_byte`
// stores only the payload bits in each variant, with the marker bits removed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParsedByte {
    // Complete one-byte character, i.e. an ASCII value
    Single(u8),
    // First byte of a two-byte sequence; one continuation byte must follow
    StartDouble(u8),
    // First byte of a three-byte sequence; two continuation bytes must follow
    StartTriple(u8),
    // First byte of a four-byte sequence; three continuation bytes must follow
    StartQuadruple(u8),
    // Byte continuing a multi-byte sequence
    ContinuationByte(u8),
}
110
111impl ParsedByte {
112    // Construct from a byte
113    const fn from_byte(byte: u8) -> Result<Self, Utf8ParserError> {
114        use Utf8ByteType::*;
115        let kind = match Utf8ByteType::of(byte) {
116            Ok(val) => val,
117            Err(err) => {
118                return Err(err);
119            }
120        };
121        let value = byte & kind.value_mask();
122
123        Ok(match kind {
124            Continuation => Self::ContinuationByte(value),
125            Single => Self::Single(value),
126            Double => Self::StartDouble(value),
127            Triple => Self::StartTriple(value),
128            Quadruple => Self::StartQuadruple(value),
129        })
130    }
131}
132
// Progress of the parser between two pushed bytes. The `*Left` variants carry
// the payload bits collected so far for the character being decoded.
#[derive(Debug, Clone, Copy)]
enum State {
    // No character is in progress
    Fresh,
    // One more continuation byte is expected
    OneLeft(u32),
    // Two more continuation bytes are expected
    TwoLeft(u32),
    // Three more continuation bytes are expected
    ThreeLeft(u32),
}
140
141const fn push_byte(current: u32, byte: u8) -> u32 {
142    debug_assert!(current <= 0x00FFFFFF);
143    debug_assert!(byte <= 0b0011_1111);
144    (current << Utf8ByteType::Continuation.value_mask_length()) | (byte as u32)
145}
146
147/// A stateful UTF-8 parser.
148///
149/// # Behavior on Errors
150///
151/// [Utf8Parser] will reset on errors. Example:
152///
153/// ```
154/// # fn main() -> Result<(), utf8_parser::Utf8ParserError> {
155/// use utf8_parser::Utf8Parser;
156///
157/// let mut parser = Utf8Parser::new();
158/// // Utf-8 start byte
159/// assert!(parser.push(0xf0)?.is_none());
160/// // A continuation byte is expected here, but we're pushing an ASCII char
161/// assert!(parser.push(b'a').is_err());
162/// // The state is reset, so this now no longer errors
163/// assert_eq!(parser.push(b'a'), Ok(Some('a')));
164/// # Ok(())
165/// # }
166/// ```
167#[derive(Clone, Debug)]
168pub struct Utf8Parser {
169    state: State,
170}
171
172impl Utf8Parser {
173    /// Construct a new Utf8Parser
174    pub const fn new() -> Self {
175        Self {
176            state: State::Fresh,
177        }
178    }
179
180    /// Push a byte into the parser
181    pub const fn push(&mut self, byte: u8) -> Result<Option<char>, Utf8ParserError> {
182        match self.push_inner_impl(byte) {
183            Ok(val) => Ok(val),
184            // Reset on error
185            Err(val) => {
186                self.reset();
187                Err(val)
188            }
189        }
190    }
191
192    // Inner functionality of `push`
193    const fn push_inner_impl(&mut self, byte: u8) -> Result<Option<char>, Utf8ParserError> {
194        let byte = match ParsedByte::from_byte(byte) {
195            Ok(v) => v,
196            Err(e) => {
197                return Err(e);
198            }
199        };
200
201        match (self.state, byte) {
202            (State::OneLeft(current), ParsedByte::ContinuationByte(value)) => {
203                self.state = State::Fresh;
204                let val = push_byte(current, value);
205                if val < FIRST_CODE_POINT_FOR_DOUBLE {
206                    return Err(Utf8ParserError::OverlongEncoding);
207                }
208                match char::from_u32(val) {
209                    Some(val) => Ok(Some(val)),
210                    None => Err(Utf8ParserError::InvalidChar(val)),
211                }
212            }
213            (State::TwoLeft(current), ParsedByte::ContinuationByte(value)) => {
214                let val = push_byte(current, value);
215                if val << Utf8ByteType::Continuation.value_mask_length()
216                    < FIRST_CODE_POINT_FOR_TRIPLE
217                {
218                    return Err(Utf8ParserError::OverlongEncoding);
219                }
220                self.state = State::OneLeft(val);
221                Ok(None)
222            }
223            (State::ThreeLeft(current), ParsedByte::ContinuationByte(value)) => {
224                let val = push_byte(current, value);
225                if val << (2 * Utf8ByteType::Continuation.value_mask_length())
226                    < FIRST_CODE_POINT_FOR_QUADRUPLE
227                {
228                    return Err(Utf8ParserError::OverlongEncoding);
229                }
230                self.state = State::TwoLeft(val);
231                Ok(None)
232            }
233            (State::Fresh, ParsedByte::Single(value)) => Ok(Some(value as char)),
234            (State::Fresh, ParsedByte::StartDouble(value)) => {
235                self.state = State::OneLeft(value as u32);
236                Ok(None)
237            }
238            (State::Fresh, ParsedByte::StartTriple(value)) => {
239                self.state = State::TwoLeft(value as u32);
240                Ok(None)
241            }
242            (State::Fresh, ParsedByte::StartQuadruple(value)) => {
243                self.state = State::ThreeLeft(value as u32);
244                Ok(None)
245            }
246            (
247                State::OneLeft(_) | State::TwoLeft(_) | State::ThreeLeft(_),
248                ParsedByte::Single(value)
249                | ParsedByte::StartDouble(value)
250                | ParsedByte::StartTriple(value)
251                | ParsedByte::StartQuadruple(value),
252            ) => Err(Utf8ParserError::UnexpectedStartByte(value)),
253            (State::Fresh, ParsedByte::ContinuationByte(value)) => {
254                Err(Utf8ParserError::UnexpectedContinuationByte(value))
255            }
256        }
257    }
258
259    // Reset the state
260    const fn reset(&mut self) {
261        self.state = State::Fresh;
262    }
263}
264
265impl Default for Utf8Parser {
266    fn default() -> Self {
267        Self::new()
268    }
269}
270
#[cfg(test)]
mod tests {
    use super::*;
    use rand::Rng;

    // Each class of byte is categorised and reduced to its payload bits
    #[test]
    fn conversion() -> Result<(), Utf8ParserError> {
        let test_vectors = &[
            (0x00, ParsedByte::Single(0x00)),
            (0x01, ParsedByte::Single(0x01)),
            (0x65, ParsedByte::Single(0x65)),
            (0x7f, ParsedByte::Single(0x7f)),
            (0b110_00000, ParsedByte::StartDouble(0)),
            (0b110_00001, ParsedByte::StartDouble(0b1)),
            (0b110_11001, ParsedByte::StartDouble(0b11001)),
            (0b110_11111, ParsedByte::StartDouble(0b11111)),
            (0b1110_0000, ParsedByte::StartTriple(0)),
            (0b1110_0001, ParsedByte::StartTriple(0b1)),
            (0b1110_1001, ParsedByte::StartTriple(0b1001)),
            (0b1110_1111, ParsedByte::StartTriple(0b1111)),
            (0b1111_0000, ParsedByte::StartQuadruple(0)),
            (0b1111_0001, ParsedByte::StartQuadruple(0b1)),
            (0b1111_0111, ParsedByte::StartQuadruple(0b111)),
            (0x80, ParsedByte::ContinuationByte(0x00)),
            (0x81, ParsedByte::ContinuationByte(0x01)),
            (0b10_111111, ParsedByte::ContinuationByte(0b111111)),
        ];

        for tv in test_vectors.iter() {
            assert_eq!(ParsedByte::from_byte(tv.0)?, tv.1);
        }

        Ok(())
    }

    // ASCII bytes come straight back out; a start byte yields nothing yet
    #[test]
    fn basic() -> Result<(), Utf8ParserError> {
        let mut parser = Utf8Parser::default();
        assert_eq!(parser.push(b'h')?, Some('h'));
        assert_eq!(parser.push(b'e')?, Some('e'));
        assert_eq!(parser.push(b'l')?, Some('l'));
        assert_eq!(parser.push(b'l')?, Some('l'));
        assert_eq!(parser.push(b'o')?, Some('o'));
        assert_eq!(parser.push(0b1101_0000)?, None);
        Ok(())
    }

    // Feed `original` through a fresh parser byte by byte and collect the
    // characters. On success the result is also compared against the standard
    // library's decoding of the same bytes.
    fn parse_str_by_bytes(original: &[u8]) -> Result<String, Utf8ParserError> {
        let mut rebuilt = String::new();

        let mut parser = Utf8Parser::default();
        for byte in original {
            if let Some(c) = parser.push(*byte)? {
                rebuilt.push(c);
            }
        }

        assert_eq!(String::from_utf8(original.into()).unwrap(), rebuilt);

        Ok(rebuilt)
    }

    #[test]
    fn parse_ascii_stream() -> Result<(), Utf8ParserError> {
        parse_str_by_bytes("The quick brown fox jamped over the lazy dog".as_bytes())?;
        Ok(())
    }

    // Mixes a two-byte character (U+00E9) with two four-byte emoji
    // (U+1F98A fox, U+1F415 dog). Written as escapes so the test input cannot
    // be damaged by the source file being re-encoded.
    #[test]
    fn parse_emoji_stream() -> Result<(), Utf8ParserError> {
        parse_str_by_bytes(
            "Th\u{e9} quick brown \u{1f98a} jamped over the lazy \u{1f415}".as_bytes(),
        )?;
        Ok(())
    }

    #[test]
    fn reset_state_after_error() {
        let mut parser = Utf8Parser::new();

        // Push a valid start byte
        assert!(parser.push(0b1110_0000).is_ok());
        // Push an invalid byte
        assert!(parser.push(0b1111_1110).is_err());
        assert_eq!(parser.push(b'a'), Ok(Some('a')));
    }

    // The parser is usable from a `const fn`
    #[test]
    const fn const_usage() {
        let mut parser = Utf8Parser::new();

        // F0 9F 90 95 is the UTF-8 encoding of U+1F415 (dog)
        assert!(matches!(parser.push(0xf0), Ok(None)));
        assert!(matches!(parser.push(0x9f), Ok(None)));
        assert!(matches!(parser.push(0x90), Ok(None)));
        assert!(matches!(parser.push(0x95), Ok(Some('\u{1F415}'))));
    }

    #[test]
    fn error_on_overlong_encodings() {
        let good: Vec<(&[u8], u32)> = vec![
            // Represent 0x0 in one byte
            (&[0b0_0000000], 0x00),
            // Represent 0x7F in one byte
            (&[0b0_1111111], 0x7f),
            // Represent 0x80 in two bytes
            (&[0b110_00010, 0b10_000000], 0x80),
            // Represent 0x7ff in two bytes
            (&[0b110_11111, 0b10_111111], 0x7ff),
            // Represent 0x800 in three bytes
            (&[0b1110_0000, 0b10_100000, 0b10_000000], 0x800),
            // Represent 0xFFFF in three bytes
            (&[0b1110_1111, 0b10_111111, 0b10_111111], 0xFFFF),
            // Represent 0x10000 in four bytes
            (
                &[0b11110_000, 0b10_010000, 0b10_000000, 0b10_000000],
                0x10000,
            ),
            // Represent 0x10FFFF in four bytes
            (
                &[0b11110_100, 0b10_001111, 0b10_111111, 0b10_111111],
                0x10FFFF,
            ),
        ];
        let overlong: Vec<&[u8]> = vec![
            // Represent 0x00 in two bytes
            &[0b110_00000, 0b10_000000],
            // Represent 0x7F in two bytes
            &[0b110_00001, 0b10_111111],
            // Represent 0x00 in three bytes
            &[0b1110_0000, 0b10_000000, 0b10_000000],
            // Represent 0x7ff in three bytes
            &[0b1110_0000, 0b10_011111, 0b10_111111],
            // Represent 0x0 in four bytes
            &[0b11110_000, 0b10_000000, 0b10_000000, 0b10_000000],
            // Represent 0xFFFF in four bytes
            &[0b11110_000, 0b10_001111, 0b10_000000, 0b10_111111],
        ];
        let err_but_not_overlong: Vec<&[u8]> = vec![
            // Represent 0x110000 in four bytes
            &[0b11110_110, 0b10_000000, 0b10_000000, 0b10_000000],
        ];

        for tv in good {
            assert_eq!(
                parse_str_by_bytes(tv.0).unwrap().chars().next().unwrap() as u32,
                tv.1
            );
        }

        for tv in overlong {
            assert_eq!(
                parse_str_by_bytes(tv).unwrap_err(),
                Utf8ParserError::OverlongEncoding
            );
        }

        for tv in err_but_not_overlong {
            assert_ne!(
                parse_str_by_bytes(tv).unwrap_err(),
                Utf8ParserError::OverlongEncoding
            );
        }
    }

    // Arbitrary bytes may produce errors, but must never panic
    #[test]
    fn random_input_dont_panic() {
        let mut parser = Utf8Parser::default();
        let mut rng = rand::rng();
        for _ in 0..1_000_000 {
            let _ = parser.push(rng.random());
        }
    }

    // Every value below 0x80 is a complete character on its own
    #[test]
    fn random_ascii_dont_error() {
        let mut parser = Utf8Parser::default();
        let mut rng = rand::rng();
        for _ in 0..1_000_000 {
            let val: u8 = rng.random();
            parser.push(val % 0x80).unwrap();
        }
    }
}