text_fx/
utf8.rs

/// One step of output when decoding a byte stream as UTF-8.
///
/// The standard comparison and formatting traits are derived so that chunks
/// can be printed with `{:?}`, compared in assertions, copied freely, and
/// used as keys in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Utf8Chunk {
    /// A successfully decoded Unicode scalar value.
    Valid(char),
    /// A single byte that could not be decoded as part of a UTF-8 sequence.
    Invalid(u8),
}
5
/// Cursor over a borrowed byte slice, used to walk it as lossy UTF-8.
///
/// The struct only records the input and how far into it the cursor has
/// advanced; it never copies or modifies the underlying bytes.
#[derive(Debug, Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct Utf8Iter<'a> {
    /// The complete input being decoded.
    bytes: &'a [u8],
    /// Offset into `bytes` of the next byte that has not been consumed yet.
    pos: usize,
}
10
11impl<'a> Utf8Iter<'a> {
12    pub fn new(bytes: &'a [u8]) -> Self {
13        Utf8Iter { bytes, pos: 0 }
14    }
15}
16
17impl<'a> Iterator for Utf8Iter<'a> {
18    type Item = Utf8Chunk;
19
20    fn next(&mut self) -> Option<Self::Item> {
21        if self.pos >= self.bytes.len() {
22            return None;
23        }
24
25        let remaining = &self.bytes[self.pos..];
26
27        match std::str::from_utf8(remaining) {
28            Ok(valid) => {
29                // All valid — take one char and go on
30                let mut chars = valid.chars();
31                if let Some(c) = chars.next() {
32                    self.pos += c.len_utf8();
33                    Some(Utf8Chunk::Valid(c))
34                } else {
35                    None
36                }
37            }
38            Err(err) => {
39                let valid_up_to = err.valid_up_to();
40                if valid_up_to > 0 {
41                    // SAFETY: guaranteed valid range
42                    let valid_str =
43                        unsafe { std::str::from_utf8_unchecked(&remaining[..valid_up_to]) };
44                    let c = valid_str.chars().next().unwrap();
45                    self.pos += c.len_utf8();
46                    Some(Utf8Chunk::Valid(c))
47                } else {
48                    // First byte is invalid
49                    let b = self.bytes[self.pos];
50                    self.pos += 1;
51                    Some(Utf8Chunk::Invalid(b))
52                }
53            }
54        }
55    }
56}
57
#[cfg(test)]
mod tests {
    use super::*;

    /// Renders the iterator's output as text: valid characters are copied
    /// through, and each invalid byte becomes a replacement marker followed
    /// by its value in hex.
    fn render(bytes: &[u8]) -> String {
        let mut out = String::new();
        for chunk in Utf8Iter::new(bytes) {
            match chunk {
                Utf8Chunk::Valid(c) => out.push(c),
                Utf8Chunk::Invalid(b) => out.push_str(&format!("�(0x{:02X})", b)),
            }
        }
        out
    }

    #[test]
    fn test_utf8_iter() {
        // `\xF0\x90` is a cut-off four-byte sequence and `\xC3` is a lead
        // byte with no continuation; each offending byte is reported alone.
        let bytes = b"hello \xF0\x90world\xC3(";
        assert_eq!(render(bytes), "hello �(0xF0)�(0x90)world�(0xC3)(");
    }

    #[test]
    fn test_empty_input_yields_nothing() {
        assert!(Utf8Iter::new(b"").next().is_none());
    }

    #[test]
    fn test_multibyte_and_truncated_tail() {
        // Two complete multi-byte characters, then a four-byte lead with
        // only one continuation byte before the input ends.
        assert_eq!(
            render(b"\xC3\xA9\xE2\x82\xAC\xF0\x9F"),
            "\u{e9}\u{20ac}�(0xF0)�(0x9F)"
        );
    }
}