moore_common/grind/
utf8.rs

1// Copyright (c) 2016-2021 Fabian Schuiki
2
3//! A UTF8 parser that keeps track of the index and size of the characters it
4//! emits. This parser does not generate any errors, but rather replaces invalid
5//! encoding in the input with the `U+FFFD REPLACEMENT CHARACTER`.
6
7use crate::grind::Grinder;
8use std;
9
10pub struct Utf8<T: Grinder<Item = Option<u8>>> {
11    inner: T,
12    current: usize,
13    peek: Option<u8>,
14}
15
16impl<T> Utf8<T>
17where
18    T: Grinder<Item = Option<u8>>,
19{
20    /// Create a new UTF8 parser.
21    pub fn new(mut inner: T) -> Utf8<T> {
22        let peek = inner.next();
23        Utf8 {
24            inner: inner,
25            current: 0,
26            peek: peek,
27        }
28    }
29
30    /// Advance the input by one position.
31    fn bump(&mut self) {
32        self.peek = self.inner.next();
33        self.current += 1;
34    }
35}
36
37impl<T> Grinder for Utf8<T>
38where
39    T: Grinder<Item = Option<u8>>,
40{
41    type Item = Option<(usize, char, u8)>;
42    type Error = T::Error;
43
44    fn next(&mut self) -> Self::Item {
45        let offset = self.current;
46        let lead = match self.peek {
47            Some(c) => c,
48            None => return None,
49        };
50        self.bump();
51        let indicator = (!lead).leading_zeros() as u8;
52        match indicator {
53            // ASCII
54            0 => Some((offset, lead as char, 1)),
55
56            // Continuation byte without preceeding leading byte
57            1 => Some((offset, '�', 1)),
58
59            // Multi-byte encoding
60            2 | 3 | 4 => {
61                let mut v = lead as u32 & (0xFF >> indicator);
62                for i in 1..indicator {
63                    match self.peek {
64                        Some(c) if (c >> 6 == 2) => v = (v << 6) | (c as u32 & 0x3F),
65                        _ => return Some((offset, '�', i)),
66                    }
67                    self.bump();
68                }
69                Some((offset, std::char::from_u32(v).unwrap_or('�'), indicator))
70            }
71
72            // Invalid number of leading ones.
73            _ => Some((offset, '�', 1)),
74        }
75    }
76
77    fn emit(&mut self, err: Self::Error) {
78        self.inner.emit(err)
79    }
80}