text_fx/utf8.rs
/// A single item produced by [`Utf8Iter`]: either a decoded Unicode character
/// or one byte that could not be decoded.
///
/// Both payloads are small plain values (`char` and `u8`), so the type is
/// `Copy` and `Hash` in addition to the comparison traits; chunks can be
/// passed by value and used as set or map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Utf8Chunk {
    /// A successfully decoded Unicode character.
    Valid(char),
    /// A byte that could not be decoded as UTF-8.
    Invalid(u8),
}
9
/// An iterator over a byte slice that yields valid Unicode characters or invalid bytes.
///
/// The iterator decodes UTF-8 from the front of the input. On each step:
/// - If the next bytes form a valid UTF-8 character, it yields `Utf8Chunk::Valid(char)`
///   and advances past that character.
/// - Otherwise it yields `Utf8Chunk::Invalid(u8)` for the single byte at the current
///   position and advances by exactly one byte, so a broken multi-byte sequence is
///   reported one byte at a time.
///
/// This is useful for robustly processing possibly-invalid UTF-8 data.
///
/// The iterator is `Clone`, so a copy can be taken to look ahead without
/// disturbing the original position.
///
/// # Examples
///
/// ```
/// use text_fx::utf8::{Utf8Iter, Utf8Chunk};
///
/// let bytes = b"he\xffllo";
/// let mut iter = Utf8Iter::new(bytes);
/// assert_eq!(iter.next(), Some(Utf8Chunk::Valid('h')));
/// assert_eq!(iter.next(), Some(Utf8Chunk::Valid('e')));
/// assert_eq!(iter.next(), Some(Utf8Chunk::Invalid(0xff)));
/// assert_eq!(iter.next(), Some(Utf8Chunk::Valid('l')));
/// assert_eq!(iter.next(), Some(Utf8Chunk::Valid('l')));
/// assert_eq!(iter.next(), Some(Utf8Chunk::Valid('o')));
/// assert_eq!(iter.next(), None);
/// ```
#[derive(Debug, Clone)]
pub struct Utf8Iter<'a> {
    /// The complete input being decoded.
    bytes: &'a [u8],
    /// Offset into `bytes` of the next byte to examine.
    pos: usize,
}
34
35impl<'a> Utf8Iter<'a> {
36 /// Create a new `Utf8Iter` from a byte slice.
37 pub fn new(bytes: &'a [u8]) -> Self {
38 Utf8Iter { bytes, pos: 0 }
39 }
40}
41
42impl<'a> Iterator for Utf8Iter<'a> {
43 type Item = Utf8Chunk;
44
45 fn next(&mut self) -> Option<Self::Item> {
46 if self.pos >= self.bytes.len() {
47 return None;
48 }
49
50 let remaining = &self.bytes[self.pos..];
51
52 match std::str::from_utf8(remaining) {
53 Ok(valid) => {
54 // All valid — take one char and go on
55 let mut chars = valid.chars();
56 if let Some(c) = chars.next() {
57 self.pos += c.len_utf8();
58 Some(Utf8Chunk::Valid(c))
59 } else {
60 None
61 }
62 }
63 Err(err) => {
64 let valid_up_to = err.valid_up_to();
65 if valid_up_to > 0 {
66 // SAFETY: guaranteed valid range
67 let valid_str =
68 unsafe { std::str::from_utf8_unchecked(&remaining[..valid_up_to]) };
69 let c = valid_str.chars().next().unwrap();
70 self.pos += c.len_utf8();
71 Some(Utf8Chunk::Valid(c))
72 } else {
73 // First byte is invalid
74 let b = self.bytes[self.pos];
75 self.pos += 1;
76 Some(Utf8Chunk::Invalid(b))
77 }
78 }
79 }
80 }
81}
82
#[cfg(test)]
mod tests {
    use super::Utf8Chunk::{Invalid, Valid};
    use super::*;

    /// Runs the iterator to completion and gathers every chunk it yields.
    fn decode(bytes: &[u8]) -> Vec<Utf8Chunk> {
        Utf8Iter::new(bytes).collect()
    }

    #[test]
    fn test_utf8_iter() {
        // `\xF0\x90` is a truncated 4-byte sequence and `\xC3` is a lead byte
        // followed by a non-continuation byte; each offending byte must surface
        // on its own while the surrounding text decodes normally.
        let bytes = b"hello \xF0\x90world\xC3(";

        let expected = vec![
            Valid('h'),
            Valid('e'),
            Valid('l'),
            Valid('l'),
            Valid('o'),
            Valid(' '),
            Invalid(0xF0),
            Invalid(0x90),
            Valid('w'),
            Valid('o'),
            Valid('r'),
            Valid('l'),
            Valid('d'),
            Invalid(0xC3),
            Valid('('),
        ];
        assert_eq!(decode(bytes), expected);
    }

    #[test]
    fn test_utf8_iter_empty_input() {
        assert_eq!(Utf8Iter::new(b"").next(), None);
    }

    #[test]
    fn test_utf8_iter_multibyte_characters() {
        let text = "a\u{e9}\u{20ac}\u{1F600}z";
        let expected: Vec<Utf8Chunk> = text.chars().map(Valid).collect();
        assert_eq!(decode(text.as_bytes()), expected);
    }

    #[test]
    fn test_utf8_iter_truncated_tail() {
        // A multi-byte character cut off by the end of the input.
        assert_eq!(
            decode(b"ok\xE2\x82"),
            vec![Valid('o'), Valid('k'), Invalid(0xE2), Invalid(0x82)]
        );
    }
}