text_fx/
utf8.rs

/// A single item yielded by [`Utf8Iter`]: either a decoded Unicode character
/// or one raw byte that could not be decoded as UTF-8.
///
/// Both payloads are plain `Copy` values, so the chunk itself is `Copy` and
/// can be passed around by value. It is also `Hash`, which allows chunks to be
/// used as keys in hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Utf8Chunk {
    /// A valid Unicode character.
    Valid(char),
    /// An invalid byte that could not be decoded as UTF-8.
    Invalid(u8),
}
9
/// An iterator over a byte slice that yields valid Unicode characters or invalid bytes.
///
/// This iterator attempts to decode UTF-8 from the input byte slice. For each step:
/// - If a valid UTF-8 character is found, it yields `Utf8Chunk::Valid(char)` and
///   advances past that character's encoding.
/// - If an invalid byte is encountered, it yields `Utf8Chunk::Invalid(u8)` and advances by one byte.
///
/// This is useful for robustly processing possibly-invalid UTF-8 data.
///
/// The iterator only borrows the input, so cloning it is cheap and gives an
/// independent cursor over the same bytes.
///
/// # Examples
///
/// ```
/// use text_fx::utf8::{Utf8Iter, Utf8Chunk};
///
/// let bytes = b"he\xffllo";
/// let mut iter = Utf8Iter::new(bytes);
/// assert_eq!(iter.next(), Some(Utf8Chunk::Valid('h')));
/// assert_eq!(iter.next(), Some(Utf8Chunk::Valid('e')));
/// assert_eq!(iter.next(), Some(Utf8Chunk::Invalid(0xff)));
/// assert_eq!(iter.next(), Some(Utf8Chunk::Valid('l')));
/// assert_eq!(iter.next(), Some(Utf8Chunk::Valid('l')));
/// assert_eq!(iter.next(), Some(Utf8Chunk::Valid('o')));
/// assert_eq!(iter.next(), None);
/// ```
#[derive(Debug, Clone)]
pub struct Utf8Iter<'a> {
    /// The complete input being decoded.
    bytes: &'a [u8],
    /// Offset into `bytes` of the first byte that has not been yielded yet.
    pos: usize,
}
34
35impl<'a> Utf8Iter<'a> {
36    /// Create a new `Utf8Iter` from a byte slice.
37    pub fn new(bytes: &'a [u8]) -> Self {
38        Utf8Iter { bytes, pos: 0 }
39    }
40}
41
42impl<'a> Iterator for Utf8Iter<'a> {
43    type Item = Utf8Chunk;
44
45    fn next(&mut self) -> Option<Self::Item> {
46        if self.pos >= self.bytes.len() {
47            return None;
48        }
49
50        let remaining = &self.bytes[self.pos..];
51
52        match std::str::from_utf8(remaining) {
53            Ok(valid) => {
54                // All valid — take one char and go on
55                let mut chars = valid.chars();
56                if let Some(c) = chars.next() {
57                    self.pos += c.len_utf8();
58                    Some(Utf8Chunk::Valid(c))
59                } else {
60                    None
61                }
62            }
63            Err(err) => {
64                let valid_up_to = err.valid_up_to();
65                if valid_up_to > 0 {
66                    // SAFETY: guaranteed valid range
67                    let valid_str =
68                        unsafe { std::str::from_utf8_unchecked(&remaining[..valid_up_to]) };
69                    let c = valid_str.chars().next().unwrap();
70                    self.pos += c.len_utf8();
71                    Some(Utf8Chunk::Valid(c))
72                } else {
73                    // First byte is invalid
74                    let b = self.bytes[self.pos];
75                    self.pos += 1;
76                    Some(Utf8Chunk::Invalid(b))
77                }
78            }
79        }
80    }
81}
82
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the iterator over `bytes` and gathers every chunk it yields.
    fn decode(bytes: &[u8]) -> Vec<Utf8Chunk> {
        Utf8Iter::new(bytes).collect()
    }

    /// Wraps every character of `text` in `Utf8Chunk::Valid`.
    fn valid(text: &str) -> Vec<Utf8Chunk> {
        text.chars().map(Utf8Chunk::Valid).collect()
    }

    #[test]
    fn test_utf8_iter() {
        // A truncated four-byte sequence in the middle and a two-byte lead
        // followed by a non-continuation byte near the end.
        let bytes = b"hello \xF0\x90world\xC3(";

        let mut expected = valid("hello ");
        expected.push(Utf8Chunk::Invalid(0xF0));
        expected.push(Utf8Chunk::Invalid(0x90));
        expected.extend(valid("world"));
        expected.push(Utf8Chunk::Invalid(0xC3));
        expected.push(Utf8Chunk::Valid('('));

        assert_eq!(decode(bytes), expected);
    }

    #[test]
    fn test_empty_input_yields_nothing() {
        let mut iter = Utf8Iter::new(b"");
        assert_eq!(iter.next(), None);
        // Asking again after exhaustion keeps returning `None`.
        assert_eq!(iter.next(), None);
    }

    #[test]
    fn test_multibyte_characters_are_decoded_whole() {
        let text = "a\u{e9}\u{20ac}\u{1d11e}z";
        assert_eq!(decode(text.as_bytes()), valid(text));
    }

    #[test]
    fn test_truncated_trailing_sequence_is_reported_bytewise() {
        // The first two bytes of a three-byte character, cut off by end of input.
        assert_eq!(
            decode(b"a\xE2\x82"),
            vec![
                Utf8Chunk::Valid('a'),
                Utf8Chunk::Invalid(0xE2),
                Utf8Chunk::Invalid(0x82),
            ]
        );
    }
}