1pub(crate) fn is_word_byte(b: u8) -> bool {
4    const fn mkwordset() -> [bool; 256] {
5        let mut set = [false; 256];
7        set[b'_' as usize] = true;
8
9        let mut byte = b'0';
10        while byte <= b'9' {
11            set[byte as usize] = true;
12            byte += 1;
13        }
14        byte = b'A';
15        while byte <= b'Z' {
16            set[byte as usize] = true;
17            byte += 1;
18        }
19        byte = b'a';
20        while byte <= b'z' {
21            set[byte as usize] = true;
22            byte += 1;
23        }
24        set
25    }
26    const WORD: [bool; 256] = mkwordset();
27    WORD[b as usize]
28}
29
30const ACCEPT: usize = 12;
33const REJECT: usize = 0;
36
37pub(crate) fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
40    match decode(slice) {
41        (Some(ch), size) => (ch, size),
42        (None, size) => ('\u{FFFD}', size),
43    }
44}
45
46pub(crate) fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
57    let slice = slice.as_ref();
58    match slice.get(0) {
59        None => return (None, 0),
60        Some(&b) if b <= 0x7F => return (Some(b as char), 1),
61        _ => {}
62    }
63
64    let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
65    while i < slice.len() {
66        decode_step(&mut state, &mut cp, slice[i]);
67        i += 1;
68
69        if state == ACCEPT {
70            let ch = char::from_u32(cp).unwrap();
76            return (Some(ch), i);
77        } else if state == REJECT {
78            return (None, core::cmp::max(1, i.saturating_sub(1)));
80        }
81    }
82    (None, i)
83}
84
85fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
87    #[rustfmt::skip]
91    const CLASSES: [u8; 256] = [
92       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
93       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
94       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
95       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
96       1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
97       7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
98       8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
99      10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
100    ];
101
102    #[rustfmt::skip]
105    const STATES_FORWARD: &'static [u8] = &[
106      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
107      12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
108      0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
109      0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
110      0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
111      0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
112      0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
113      0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
114      0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115    ];
116
117    let class = CLASSES[usize::from(b)];
118    if *state == ACCEPT {
119        *cp = (0xFF >> class) & (b as u32);
120    } else {
121        *cp = (b as u32 & 0b111111) | (*cp << 6);
122    }
123    *state = usize::from(STATES_FORWARD[*state + usize::from(class)]);
124}
125
126#[cfg(test)]
127mod tests {
128    use alloc::{vec, vec::Vec};
129
130    use super::*;
131
132    #[test]
133    fn decode_valid() {
134        fn d(mut s: &str) -> Vec<char> {
135            let mut chars = vec![];
136            while !s.is_empty() {
137                let (ch, size) = decode(s.as_bytes());
138                s = &s[size..];
139                chars.push(ch.unwrap());
140            }
141            chars
142        }
143
144        assert_eq!(vec!['☃'], d("☃"));
145        assert_eq!(vec!['☃', '☃'], d("☃☃"));
146        assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
147        assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
148        assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲"));
149    }
150
151    #[test]
152    fn decode_invalid() {
153        let (ch, size) = decode(b"");
154        assert_eq!(None, ch);
155        assert_eq!(0, size);
156
157        let (ch, size) = decode(b"\xFF");
158        assert_eq!(None, ch);
159        assert_eq!(1, size);
160
161        let (ch, size) = decode(b"\xCE\xF0");
162        assert_eq!(None, ch);
163        assert_eq!(1, size);
164
165        let (ch, size) = decode(b"\xE2\x98\xF0");
166        assert_eq!(None, ch);
167        assert_eq!(2, size);
168
169        let (ch, size) = decode(b"\xF0\x9D\x9D");
170        assert_eq!(None, ch);
171        assert_eq!(3, size);
172
173        let (ch, size) = decode(b"\xF0\x9D\x9D\xF0");
174        assert_eq!(None, ch);
175        assert_eq!(3, size);
176
177        let (ch, size) = decode(b"\xF0\x82\x82\xAC");
178        assert_eq!(None, ch);
179        assert_eq!(1, size);
180
181        let (ch, size) = decode(b"\xED\xA0\x80");
182        assert_eq!(None, ch);
183        assert_eq!(1, size);
184
185        let (ch, size) = decode(b"\xCEa");
186        assert_eq!(None, ch);
187        assert_eq!(1, size);
188
189        let (ch, size) = decode(b"\xE2\x98a");
190        assert_eq!(None, ch);
191        assert_eq!(2, size);
192
193        let (ch, size) = decode(b"\xF0\x9D\x9Ca");
194        assert_eq!(None, ch);
195        assert_eq!(3, size);
196    }
197
198    #[test]
199    fn decode_lossily() {
200        let (ch, size) = decode_lossy(b"");
201        assert_eq!('\u{FFFD}', ch);
202        assert_eq!(0, size);
203
204        let (ch, size) = decode_lossy(b"\xFF");
205        assert_eq!('\u{FFFD}', ch);
206        assert_eq!(1, size);
207
208        let (ch, size) = decode_lossy(b"\xCE\xF0");
209        assert_eq!('\u{FFFD}', ch);
210        assert_eq!(1, size);
211
212        let (ch, size) = decode_lossy(b"\xE2\x98\xF0");
213        assert_eq!('\u{FFFD}', ch);
214        assert_eq!(2, size);
215
216        let (ch, size) = decode_lossy(b"\xF0\x9D\x9D\xF0");
217        assert_eq!('\u{FFFD}', ch);
218        assert_eq!(3, size);
219
220        let (ch, size) = decode_lossy(b"\xF0\x82\x82\xAC");
221        assert_eq!('\u{FFFD}', ch);
222        assert_eq!(1, size);
223
224        let (ch, size) = decode_lossy(b"\xED\xA0\x80");
225        assert_eq!('\u{FFFD}', ch);
226        assert_eq!(1, size);
227
228        let (ch, size) = decode_lossy(b"\xCEa");
229        assert_eq!('\u{FFFD}', ch);
230        assert_eq!(1, size);
231
232        let (ch, size) = decode_lossy(b"\xE2\x98a");
233        assert_eq!('\u{FFFD}', ch);
234        assert_eq!(2, size);
235
236        let (ch, size) = decode_lossy(b"\xF0\x9D\x9Ca");
237        assert_eq!('\u{FFFD}', ch);
238        assert_eq!(3, size);
239    }
240}