/// One item produced while decoding a byte slice with [`Utf8Iter`].
///
/// The decoder never fails: bytes that do not begin a well-formed UTF-8
/// sequence are handed back to the caller one at a time instead.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Utf8Chunk {
    /// A successfully decoded Unicode scalar value.
    Valid(char),
    /// A single byte that could not be decoded as the start of a character.
    Invalid(u8),
}
5
/// Iterator that walks a byte slice and yields one [`Utf8Chunk`] per decoded
/// character or undecodable byte.
///
/// Cloning is cheap (a slice reference plus an offset) and gives an
/// independent cursor at the same position.
#[derive(Debug, Clone)]
pub struct Utf8Iter<'a> {
    /// The complete input being decoded.
    bytes: &'a [u8],
    /// Offset into `bytes` of the next byte that has not been consumed yet.
    pos: usize,
}
10
11impl<'a> Utf8Iter<'a> {
12 pub fn new(bytes: &'a [u8]) -> Self {
13 Utf8Iter { bytes, pos: 0 }
14 }
15}
16
17impl<'a> Iterator for Utf8Iter<'a> {
18 type Item = Utf8Chunk;
19
20 fn next(&mut self) -> Option<Self::Item> {
21 if self.pos >= self.bytes.len() {
22 return None;
23 }
24
25 let remaining = &self.bytes[self.pos..];
26
27 match std::str::from_utf8(remaining) {
28 Ok(valid) => {
29 let mut chars = valid.chars();
31 if let Some(c) = chars.next() {
32 self.pos += c.len_utf8();
33 Some(Utf8Chunk::Valid(c))
34 } else {
35 None
36 }
37 }
38 Err(err) => {
39 let valid_up_to = err.valid_up_to();
40 if valid_up_to > 0 {
41 let valid_str =
43 unsafe { std::str::from_utf8_unchecked(&remaining[..valid_up_to]) };
44 let c = valid_str.chars().next().unwrap();
45 self.pos += c.len_utf8();
46 Some(Utf8Chunk::Valid(c))
47 } else {
48 let b = self.bytes[self.pos];
50 self.pos += 1;
51 Some(Utf8Chunk::Invalid(b))
52 }
53 }
54 }
55 }
56}
57
#[cfg(test)]
mod tests {
    use super::*;

    /// Renders the chunk stream as text so whole runs can be compared at once:
    /// valid characters appear verbatim, each invalid byte as `�(0xNN)`.
    fn render(bytes: &[u8]) -> String {
        let mut out = String::new();
        for chunk in Utf8Iter::new(bytes) {
            match chunk {
                Utf8Chunk::Valid(c) => out.push(c),
                Utf8Chunk::Invalid(b) => out.push_str(&format!("�(0x{:02X})", b)),
            }
        }
        out
    }

    #[test]
    fn test_utf8_iter() {
        // `\xF0\x90` is a four-byte sequence cut short by `w`, and `\xC3` is a
        // two-byte lead followed by `(`; each stray byte is reported alone.
        let bytes = b"hello \xF0\x90world\xC3(";
        assert_eq!(render(bytes), "hello �(0xF0)�(0x90)world�(0xC3)(");
    }

    #[test]
    fn test_empty_input_yields_nothing() {
        assert!(Utf8Iter::new(b"").next().is_none());
    }

    #[test]
    fn test_valid_multibyte_round_trips() {
        let text = "héllo €𝄞";
        assert_eq!(render(text.as_bytes()), text);
    }

    #[test]
    fn test_truncated_tail_is_invalid() {
        // `\xE2\x82` is the start of a three-byte sequence missing its last byte.
        assert_eq!(render(b"a\xE2\x82"), "a�(0xE2)�(0x82)");
    }
}