lazy_char_iter/
lib.rs

//! This crate provides a `.chars()` method for `&Vec<u8>` and `&[u8]` types, allowing you to iterate over the
//! characters in a byte vector or slice lazily: each character is decoded only when the iterator reaches it.
3//!
//! In typical usage, you `use` the `LazyCharIterExt` trait (implemented for `&Vec<u8>` and `&[u8]`) and call the
//! `.chars()` method on those types:
6//!
7//! ```
8//! use lazy_char_iter::LazyCharIterExt;
9//!
10//! let bread_str: &str = "brød";
11//! let bread_bytes: &[u8] = bread_str.as_bytes();
12//! let mut char_iter = bread_bytes.chars();
13//! assert_eq!(char_iter.next(), Some(Ok('b')));
14//! ```
15#![warn(clippy::all)]
16
17use std::{
18    error::Error,
19    fmt::{Display, Formatter, Result as FmtResult},
20    iter::{DoubleEndedIterator, FusedIterator, Iterator},
21};
22
/// Iterate over the characters in a given type.
pub trait LazyCharIterExt {
    /// The type returned by [`chars`](LazyCharIterExt::chars). For the implementations in this crate this is
    /// [`LazyCharSliceIter`].
    type Iter;

    /// Returns an iterator over the `char`s of the given type.
    ///
    /// Since the underlying bytes are not guaranteed to be valid UTF-8, the iterators provided by this crate yield
    /// `Result<char, Utf8Error>` items (so `next()` returns `Option<Result<char, Utf8Error>>`) instead of just `char`.
    ///
    /// It's important to remember that `char` represents a Unicode Scalar Value, and might not match your idea of
    /// what a ‘character’ is. Iteration over grapheme clusters may be what you actually want. This functionality is
    /// not provided here; check [crates.io](https://crates.io) instead.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// use lazy_char_iter::LazyCharIterExt;
    ///
    /// let bread_str: &str = "brød";
    /// let bread_bytes: &[u8] = bread_str.as_bytes();
    /// assert!(bread_bytes.len() == 5); // ø is \xc3\xb8
    ///
    /// let mut char_iter = bread_bytes.chars();
    /// assert_eq!(char_iter.next(), Some(Ok('b')));
    /// assert_eq!(char_iter.next(), Some(Ok('r')));
    /// assert_eq!(char_iter.next(), Some(Ok('ø')));
    /// assert_eq!(char_iter.next(), Some(Ok('d')));
    /// assert_eq!(char_iter.next(), None);
    /// ```
    ///
    /// Invalid UTF-8 results in an error when the invalid character is hit:
    ///
    /// ```
    /// use lazy_char_iter::{LazyCharIterExt, Utf8Error};
    ///
    /// let invalid = vec![b'b', b'r', b'\xc3', b'\xc3', b'd'];
    /// let invalid_bytes: &[u8] = invalid.as_slice();
    ///
    /// let mut char_iter = invalid_bytes.chars();
    /// assert_eq!(char_iter.next(), Some(Ok('b')));
    /// assert_eq!(char_iter.next(), Some(Ok('r')));
    /// assert_eq!(char_iter.next(), Some(Err(Utf8Error::InvalidEncoding(vec![0xc3, 0xc3]))));
    /// ```
    fn chars(&self) -> Self::Iter;
}
70
/// The resulting iterator returned when `.chars()` is called on a `&[u8]` or `&Vec<u8>`.
///
/// Nothing is decoded up front: each call to `next` / `next_back` decodes a single character from the bytes that
/// have not been consumed yet.
#[derive(Clone, Debug)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct LazyCharSliceIter<'a> {
    /// Cursor over the bytes that have not been decoded yet (from either end).
    slice_iter: ::std::slice::Iter<'a, u8>,
}
76
77impl<'a> LazyCharSliceIter<'a> {
78    /// Returns the undecoded remainder of the slice.
79    ///
80    /// This has the same lifetime as the original slice, so the iterator can continue to be used while this exists.
81    ///
82    /// # Examples
83    ///
84    /// Basic usage:
85    ///
86    /// ```
87    /// use lazy_char_iter::LazyCharIterExt;
88    ///
89    /// let bread_str: &str = "brød";
90    /// let bread_bytes: &[u8] = bread_str.as_bytes();
91    ///
92    /// // Iterate over the characters.
93    /// let mut char_iter = bread_bytes.chars();
94    ///
95    /// // If we print the remaining characters, we have "[98, 114, 195, 184, 100]" (the UTF-8 bytes for "brød").
96    /// assert_eq!(format!("{:?}", char_iter.remaining()), "[98, 114, 195, 184, 100]");
97    ///
98    /// // Move to the fourth character of the slice.
99    /// char_iter.next();
100    /// char_iter.next();
101    /// char_iter.next();
102    ///
103    /// // Now the remaining character is "[100]".
104    /// assert_eq!(format!("{:?}", char_iter.remaining()), "[100]");
105    /// ```
106    pub fn remaining(&self) -> &'a [u8] {
107        self.slice_iter.as_slice()
108    }
109}
110
111impl<'a> AsRef<[u8]> for LazyCharSliceIter<'a> {
112    #[inline]
113    fn as_ref(&self) -> &'a [u8] {
114        self.slice_iter.as_slice()
115    }
116}
117
118impl<'a> Iterator for LazyCharSliceIter<'a> {
119    type Item = Result<char, Utf8Error>;
120
121    fn next(&mut self) -> Option<Self::Item> {
122        let b1 = *(self.slice_iter.next()?);
123
124        let c = if b1 & 0x80 == 0 {
125            // Single byte character.
126            b1 as u32
127        } else if b1 & 0b1110_0000 == 0b1100_0000 {
128            // Two byte character.
129            let b2 = match self.slice_iter.next() {
130                Some(b) => *b,
131                None => return Some(Err(Utf8Error::Truncated(vec![b1]))),
132            };
133
134            if b2 & 0b1100_0000 != 0b1000_0000 {
135                return Some(Err(Utf8Error::InvalidEncoding(vec![b1, b2])));
136            }
137
138            let result = ((b1 & 0b0001_1111) as u32) << 6 | (b2 & 0b0011_1111) as u32;
139            if result < 0x80 {
140                return Some(Err(Utf8Error::OverlongEncoding(vec![b1, b2])));
141            }
142
143            result
144        } else if b1 & 0b1111_0000 == 0b1110_0000 {
145            // Three byte character.
146            let b2 = match self.slice_iter.next() {
147                Some(b) => *b,
148                None => return Some(Err(Utf8Error::Truncated(vec![b1]))),
149            };
150
151            if b2 & 0b1100_0000 != 0b1000_0000 {
152                return Some(Err(Utf8Error::InvalidEncoding(vec![b1, b2])));
153            }
154
155            let b3 = match self.slice_iter.next() {
156                Some(b) => *b,
157                None => return Some(Err(Utf8Error::Truncated(vec![b1, b2]))),
158            };
159
160            if b3 & 0b1100_0000 != 0b1000_0000 {
161                return Some(Err(Utf8Error::InvalidEncoding(vec![b1, b2, b3])));
162            }
163
164            let result =
165                ((b1 & 0b0000_1111) as u32) << 12 | ((b2 & 0b0011_1111) as u32) << 6 | (b3 & 0b0011_1111) as u32;
166            if result < 0x800 {
167                return Some(Err(Utf8Error::OverlongEncoding(vec![b1, b2, b3])));
168            }
169
170            result
171        } else if b1 & 0b1111_1000 == 0b1111_0000 {
172            // Four byte character.
173            let b2 = match self.slice_iter.next() {
174                Some(b) => *b,
175                None => return Some(Err(Utf8Error::Truncated(vec![b1]))),
176            };
177
178            if b2 & 0b1100_0000 != 0b1000_0000 {
179                return Some(Err(Utf8Error::InvalidEncoding(vec![b1, b2])));
180            }
181
182            let b3 = match self.slice_iter.next() {
183                Some(b) => *b,
184                None => return Some(Err(Utf8Error::Truncated(vec![b1, b2]))),
185            };
186
187            if b3 & 0b1100_0000 != 0b1000_0000 {
188                return Some(Err(Utf8Error::InvalidEncoding(vec![b1, b2, b3])));
189            }
190
191            let b4 = match self.slice_iter.next() {
192                Some(b) => *b,
193                None => return Some(Err(Utf8Error::Truncated(vec![b1, b2, b3]))),
194            };
195
196            if b4 & 0b1100_0000 != 0b1000_0000 {
197                return Some(Err(Utf8Error::InvalidEncoding(vec![b1, b2, b3, b4])));
198            }
199
200            let result = ((b1 & 0b0000_0111) as u32) << 18
201                | ((b2 & 0b0011_1111) as u32) << 12
202                | ((b3 & 0b0011_1111) as u32) << 6
203                | (b4 & 0b0011_1111) as u32;
204            if result < 0x10000 {
205                return Some(Err(Utf8Error::OverlongEncoding(vec![b1, b2, b3, b4])));
206            }
207
208            result
209        } else {
210            // Invalid lead byte.
211            return Some(Err(Utf8Error::InvalidEncoding(vec![b1])));
212        };
213
214        match char::from_u32(c) {
215            Some(c) => Some(Ok(c)),
216            None => Some(Err(Utf8Error::InvalidCodepoint(c))),
217        }
218    }
219
220    fn size_hint(&self) -> (usize, Option<usize>) {
221        // Lower bound is the number of bytes in the slice divided by 4 (the maximum number of bytes per character).
222        // Upper bound is the number of bytes in the slice.
223        let upper = self.slice_iter.len();
224        let mut lower = upper / 4;
225        if upper % 4 > 0 {
226            lower += 1;
227        }
228
229        (lower, Some(upper))
230    }
231}
232
233impl<'a> DoubleEndedIterator for LazyCharSliceIter<'a> {
234    fn next_back(&mut self) -> Option<Self::Item> {
235        let b1 = *(self.slice_iter.next_back()?);
236
237        let c = if b1 & 0b1100_0000 == 0b1000_0000 {
238            // Continuation byte (2-4 byte encoding). Find the lead byte.
239            let b2 = match self.slice_iter.next_back() {
240                Some(b) => *b,
241                None => return Some(Err(Utf8Error::Truncated(vec![b1]))),
242            };
243
244            if b2 & 0b1100_0000 == 0b1000_0000 {
245                // Second continuation byte (3-4 byte encoding). Keep scanning to find the lead byte.
246                let b3 = match self.slice_iter.next_back() {
247                    Some(b) => *b,
248                    None => return Some(Err(Utf8Error::Truncated(vec![b2, b1]))),
249                };
250
251                if b3 & 0b1100_0000 == 0b1000_0000 {
252                    // Third continuation byte (4 byte encoding). Next byte *must* be the lead byte.
253                    let b4 = match self.slice_iter.next_back() {
254                        Some(b) => *b,
255                        None => return Some(Err(Utf8Error::Truncated(vec![b3, b2, b1]))),
256                    };
257
258                    if b4 & 0b1111_1000 != 0b1111_0000 {
259                        return Some(Err(Utf8Error::InvalidEncoding(vec![b4, b3, b2, b1])));
260                    }
261
262                    let result = ((b4 & 0b0000_0111) as u32) << 18
263                        | ((b3 & 0b0011_1111) as u32) << 12
264                        | ((b2 & 0b0011_1111) as u32) << 6
265                        | (b1 & 0b0011_1111) as u32;
266                    if result < 0x10000 {
267                        return Some(Err(Utf8Error::OverlongEncoding(vec![b4, b3, b2, b1])));
268                    }
269                    result
270                } else if b3 & 0b1111_0000 != 0b1110_0000 {
271                    return Some(Err(Utf8Error::InvalidEncoding(vec![b3, b2, b1])));
272                } else {
273                    // 3 byte encoding.
274                    let result = ((b3 & 0b0001_1111) as u32) << 12
275                        | ((b2 & 0b0011_1111) as u32) << 6
276                        | (b1 & 0b0011_1111) as u32;
277                    if result < 0x800 {
278                        return Some(Err(Utf8Error::OverlongEncoding(vec![b3, b2, b1])));
279                    }
280                    result
281                }
282            } else if b2 & 0b1110_0000 != 0b1100_0000 {
283                return Some(Err(Utf8Error::InvalidEncoding(vec![b2, b1])));
284            } else {
285                let result = ((b2 & 0b0001_1111) as u32) << 6 | (b1 & 0b0011_1111) as u32;
286                if result < 0x80 {
287                    return Some(Err(Utf8Error::OverlongEncoding(vec![b2, b1])));
288                }
289                result
290            }
291        } else if b1 & 0b1000_0000 != 0b0000_0000 {
292            // Lead byte found without continuation byte(s).
293            return Some(Err(Utf8Error::InvalidEncoding(vec![b1])));
294        } else {
295            b1 as u32
296        };
297
298        match char::from_u32(c) {
299            Some(c) => Some(Ok(c)),
300            None => Some(Err(Utf8Error::InvalidCodepoint(c))),
301        }
302    }
303}
304
305impl<'a> FusedIterator for LazyCharSliceIter<'a> {}
306
307impl<'a> LazyCharIterExt for &'a [u8] {
308    type Iter = LazyCharSliceIter<'a>;
309
310    fn chars(&self) -> Self::Iter {
311        LazyCharSliceIter {
312            slice_iter: self.iter(),
313        }
314    }
315}
316
317impl<'a> LazyCharIterExt for &'a Vec<u8> {
318    type Iter = LazyCharSliceIter<'a>;
319
320    fn chars(&self) -> Self::Iter {
321        LazyCharSliceIter {
322            slice_iter: self.iter(),
323        }
324    }
325}
326
/// The errors that can occur when decoding UTF-8.
///
/// The byte-carrying variants hold the bytes that were consumed for the failed character, in the order in which
/// they appear in the input.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Utf8Error {
    /// The decoded value is not a valid Unicode scalar value (e.g. a surrogate or a value above `0x10ffff`).
    /// Holds the decoded value.
    InvalidCodepoint(u32),

    /// The encoding is not valid UTF-8: an invalid lead byte, or a byte where a continuation byte (or the matching
    /// lead byte) was required.
    InvalidEncoding(Vec<u8>),

    /// The encoding is "overlong," e.g., a two-byte UTF-8 encoding of a codepoint that could be encoded in a single
    /// byte. This is not allowed in UTF-8 for security reasons.
    OverlongEncoding(Vec<u8>),

    /// The input ended before the character was complete.
    Truncated(Vec<u8>),
}
343
344impl Display for Utf8Error {
345    fn fmt(&self, f: &mut Formatter) -> FmtResult {
346        match self {
347            Utf8Error::InvalidCodepoint(c) => write!(f, "Invalid Unicode codepoint: {c:#x}"),
348            Utf8Error::InvalidEncoding(bytes) => {
349                write!(f, "Invalid UTF-8 encoding: [")?;
350                for (i, b) in bytes.iter().enumerate() {
351                    if i > 0 {
352                        write!(f, ", ")?;
353                    }
354                    write!(f, "{b:#04x}")?;
355                }
356                write!(f, "]")
357            }
358            Utf8Error::OverlongEncoding(bytes) => {
359                write!(f, "Overlong UTF-8 encoding: [")?;
360                for (i, b) in bytes.iter().enumerate() {
361                    if i > 0 {
362                        write!(f, ", ")?;
363                    }
364                    write!(f, "{b:#04x}")?;
365                }
366                write!(f, "]")
367            }
368            Utf8Error::Truncated(bytes) => {
369                write!(f, "Truncated UTF-8 character: [")?;
370                for (i, b) in bytes.iter().enumerate() {
371                    if i > 0 {
372                        write!(f, ", ")?;
373                    }
374                    write!(f, "{b:#04x}")?;
375                }
376                write!(f, "]")
377            }
378        }
379    }
380}
381
// Marker impl: `Utf8Error` uses the default `Error` method implementations, so it reports no underlying source.
impl Error for Utf8Error {}
383
// Unit tests for the forward and backward decoders, the error payloads, and the `Display` / `Debug` output of
// `Utf8Error`.
#[cfg(test)]
mod tests {
    use super::{LazyCharIterExt, Utf8Error};

    // Decodes one character of each encoded length (1-4 bytes) forwards and then backwards (on a clone of the
    // iterator), checking after every step that `remaining()` and `as_ref()` expose exactly the unconsumed bytes.
    #[test]
    fn test_good() {
        // Characters: a \u{0061}, á \u{00e1}, ḁ \u{1e01}, 𝒜 \u{1d49c}
        let v = vec![0x61, 0xc3, 0xa1, 0xe1, 0xb8, 0x81, 0xf0, 0x9d, 0x92, 0x9c];
        let mut chars = (&v).chars();
        let mut chars2 = chars.clone();
        assert_eq!(chars.next(), Some(Ok('a')));
        assert_eq!(chars.remaining(), &vec![0xc3, 0xa1, 0xe1, 0xb8, 0x81, 0xf0, 0x9d, 0x92, 0x9c]);
        assert_eq!(chars.as_ref(), &vec![0xc3, 0xa1, 0xe1, 0xb8, 0x81, 0xf0, 0x9d, 0x92, 0x9c]);
        assert_eq!(chars.next(), Some(Ok('á')));
        assert_eq!(chars.remaining(), &vec![0xe1, 0xb8, 0x81, 0xf0, 0x9d, 0x92, 0x9c]);
        assert_eq!(chars.as_ref(), &vec![0xe1, 0xb8, 0x81, 0xf0, 0x9d, 0x92, 0x9c]);
        assert_eq!(chars.next(), Some(Ok('ḁ')));
        assert_eq!(chars.remaining(), &vec![0xf0, 0x9d, 0x92, 0x9c]);
        assert_eq!(chars.as_ref(), &vec![0xf0, 0x9d, 0x92, 0x9c]);
        assert_eq!(chars.next(), Some(Ok('𝒜')));
        assert_eq!(chars.remaining(), &vec![]);
        assert_eq!(chars.as_ref(), &vec![]);
        assert_eq!(chars.next(), None);
        assert_eq!(chars2.next_back(), Some(Ok('𝒜')));
        assert_eq!(chars2.remaining(), &vec![0x61, 0xc3, 0xa1, 0xe1, 0xb8, 0x81]);
        assert_eq!(chars2.as_ref(), &vec![0x61, 0xc3, 0xa1, 0xe1, 0xb8, 0x81]);
        assert_eq!(chars2.next_back(), Some(Ok('ḁ')));
        assert_eq!(chars2.remaining(), &vec![0x61, 0xc3, 0xa1]);
        assert_eq!(chars2.as_ref(), &vec![0x61, 0xc3, 0xa1]);
        assert_eq!(chars2.next_back(), Some(Ok('á')));
        assert_eq!(chars2.remaining(), &vec![0x61]);
        assert_eq!(chars2.as_ref(), &vec![0x61]);
        assert_eq!(chars2.next_back(), Some(Ok('a')));
        assert_eq!(chars2.remaining(), &vec![]);
        assert_eq!(chars2.as_ref(), &vec![]);
        assert_eq!(chars2.next_back(), None);
    }

    // An empty vector yields no items, reports a (0, Some(0)) size hint, and the iterator is `Debug`-printable.
    #[test]
    fn test_empty_vec() {
        let v = vec![];
        let mut chars = (&v).chars();
        assert!(chars.next().is_none());

        assert_eq!(chars.size_hint(), (0, Some(0)));

        // Make sure we can debug print.
        let _ = format!("{chars:?}");
    }

    // Forward decoding: the input ends before the sequence announced by the lead byte is complete. The error
    // carries every byte that was consumed.
    #[test]
    fn test_truncated_forward() {
        // Two byte encoding
        let v = vec![0xc0];
        let mut chars = (&v).chars();
        let e = chars.next().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::Truncated(vec![0xc0]));

        // Three byte encoding
        let v = vec![0xe0];
        let mut chars = (&v).chars();
        let e = chars.next().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::Truncated(vec![0xe0]));

        let v = vec![0xe0, 0x80];
        let mut chars = (&v).chars();
        let e = chars.next().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::Truncated(vec![0xe0, 0x80]));

        // Four byte encoding
        let v = vec![0xf0];
        let mut chars = (&v).chars();
        let e = chars.next().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::Truncated(vec![0xf0]));

        let v = vec![0xf0, 0x80];
        let mut chars = (&v).chars();
        let e = chars.next().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::Truncated(vec![0xf0, 0x80]));

        let v = vec![0xf0, 0x80, 0x80];
        let mut chars = (&v).chars();
        let e = chars.next().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::Truncated(vec![0xf0, 0x80, 0x80]));
    }

    // Backward decoding: one to three continuation bytes with no lead byte before them are reported as truncated.
    // With four continuation bytes the fourth byte from the end has to be a four-byte lead, so that case is an
    // invalid encoding instead. Also pins the `Debug` and `Display` output of both variants.
    #[test]
    fn test_truncated_backward() {
        let v = vec![0x80];
        let mut chars = (&v).chars();
        let e = chars.next_back().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::Truncated(vec![0x80]));
        assert_eq!(format!("{e:?}"), "Truncated([128])");
        assert_eq!(format!("{e}"), "Truncated UTF-8 character: [0x80]");

        let v = vec![0x80, 0x80];
        let mut chars = (&v).chars();
        let e = chars.next_back().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::Truncated(vec![0x80, 0x80]));
        assert_eq!(format!("{e:?}"), "Truncated([128, 128])");
        assert_eq!(format!("{e}"), "Truncated UTF-8 character: [0x80, 0x80]");

        let v = vec![0x80, 0x80, 0x80];
        let mut chars = (&v).chars();
        let e = chars.next_back().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::Truncated(vec![0x80, 0x80, 0x80]));
        assert_eq!(format!("{e}"), "Truncated UTF-8 character: [0x80, 0x80, 0x80]");

        let v = vec![0x80, 0x80, 0x80, 0x80];
        let mut chars = (&v).chars();
        let e = chars.next_back().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::InvalidEncoding(vec![0x80, 0x80, 0x80, 0x80]));
        assert_eq!(format!("{e:?}"), "InvalidEncoding([128, 128, 128, 128])");
        assert_eq!(format!("{e}"), "Invalid UTF-8 encoding: [0x80, 0x80, 0x80, 0x80]");
    }

    // Overlong two-, three- and four-byte encodings of U+0000 are rejected in both directions. Each input has an
    // extra NUL byte on the far side of the sequence, which also exercises the size hint for 3, 4 and 5 bytes.
    #[test]
    fn test_overlong() {
        // Two bytes
        let v = vec![0xc0, 0x80, 0x00];
        let mut chars = (&v).chars();
        assert_eq!(chars.size_hint(), (1, Some(3)));
        let e = chars.next().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::OverlongEncoding(vec![0xc0, 0x80]));
        assert_eq!(format!("{e:?}"), "OverlongEncoding([192, 128])");
        assert_eq!(format!("{e}"), "Overlong UTF-8 encoding: [0xc0, 0x80]");

        let v = vec![0x00, 0xc0, 0x80];
        let mut chars = (&v).chars();
        assert_eq!(chars.size_hint(), (1, Some(3)));
        let e = chars.next_back().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::OverlongEncoding(vec![0xc0, 0x80]));
        assert_eq!(format!("{e:?}"), "OverlongEncoding([192, 128])");
        assert_eq!(format!("{e}"), "Overlong UTF-8 encoding: [0xc0, 0x80]");

        // Three bytes
        let v = vec![0xe0, 0x80, 0x80, 0x00];
        let mut chars = (&v).chars();
        assert_eq!(chars.size_hint(), (1, Some(4)));
        let e = chars.next().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::OverlongEncoding(vec![0xe0, 0x80, 0x80]));
        assert_eq!(format!("{e:?}"), "OverlongEncoding([224, 128, 128])");
        assert_eq!(format!("{e}"), "Overlong UTF-8 encoding: [0xe0, 0x80, 0x80]");

        let v = vec![0x00, 0xe0, 0x80, 0x80];
        let mut chars = (&v).chars();
        assert_eq!(chars.size_hint(), (1, Some(4)));
        let e = chars.next_back().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::OverlongEncoding(vec![0xe0, 0x80, 0x80]));
        assert_eq!(format!("{e:?}"), "OverlongEncoding([224, 128, 128])");
        assert_eq!(format!("{e}"), "Overlong UTF-8 encoding: [0xe0, 0x80, 0x80]");

        // Four bytes
        let v = vec![0xf0, 0x80, 0x80, 0x80, 0x00];
        let mut chars = (&v).chars();
        assert_eq!(chars.size_hint(), (2, Some(5)));
        let e = chars.next().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::OverlongEncoding(vec![0xf0, 0x80, 0x80, 0x80]));
        assert_eq!(format!("{e:?}"), "OverlongEncoding([240, 128, 128, 128])");
        assert_eq!(format!("{e}"), "Overlong UTF-8 encoding: [0xf0, 0x80, 0x80, 0x80]");

        let v = vec![0x00, 0xf0, 0x80, 0x80, 0x80];
        let mut chars = (&v).chars();
        assert_eq!(chars.size_hint(), (2, Some(5)));
        let e = chars.next_back().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::OverlongEncoding(vec![0xf0, 0x80, 0x80, 0x80]));
        assert_eq!(format!("{e:?}"), "OverlongEncoding([240, 128, 128, 128])");
        assert_eq!(format!("{e}"), "Overlong UTF-8 encoding: [0xf0, 0x80, 0x80, 0x80]");
    }

    // A well-formed four-byte sequence that decodes to 0x110000 (one past the largest Unicode scalar value) is
    // reported as an invalid codepoint; also pins the `Debug` and `Display` output of that variant.
    #[test]
    fn test_invalid_codepoint() {
        let v = vec![0xf4, 0x90, 0x80, 0x80];
        let mut chars = (&v).chars();
        assert_eq!(chars.size_hint(), (1, Some(4)));
        let e = chars.next().unwrap().unwrap_err();
        assert_eq!(e, Utf8Error::InvalidCodepoint(0x110000));
        assert_eq!(format!("{e:?}"), "InvalidCodepoint(1114112)");
        assert_eq!(format!("{e}"), "Invalid Unicode codepoint: 0x110000");
    }

    // Malformed sequences decoded from both ends (`chars2` is a clone that is only driven backwards). The two
    // directions intentionally report different byte groups for the same input: the forward decoder consumes the
    // offending byte together with the lead byte, while the backward decoder groups bytes from the end.
    #[test]
    fn test_invalid_encoding() {
        // Stray continuation byte at the front.
        let v = vec![0x80, 0x0, 0x0];
        let mut chars = (&v).chars();
        let mut chars2 = chars.clone();
        assert_eq!(chars.size_hint(), (1, Some(3)));
        assert_eq!(chars.next(), Some(Err(Utf8Error::InvalidEncoding(vec![0x80]))));
        assert_eq!(chars2.next_back(), Some(Ok('\0')));
        assert_eq!(chars2.next_back(), Some(Ok('\0')));
        assert_eq!(chars2.next_back(), Some(Err(Utf8Error::Truncated(vec![0x80]))));

        // Two-byte lead followed by another lead byte.
        let v = vec![0xc0, 0xc3, 0xbf];
        let mut chars = (&v).chars();
        let mut chars2 = chars.clone();
        assert_eq!(chars.size_hint(), (1, Some(3)));
        assert_eq!(chars.next(), Some(Err(Utf8Error::InvalidEncoding(vec![0xc0, 0xc3]))));
        assert_eq!(chars2.next_back(), Some(Ok('ÿ')));
        assert_eq!(chars2.next_back(), Some(Err(Utf8Error::InvalidEncoding(vec![0xc0]))));

        // Three-byte lead followed by a lead byte.
        let v = vec![0xe0, 0xc3, 0xbf];
        let mut chars = (&v).chars();
        let mut chars2 = chars.clone();
        assert_eq!(chars.size_hint(), (1, Some(3)));
        assert_eq!(chars.next(), Some(Err(Utf8Error::InvalidEncoding(vec![0xe0, 0xc3]))));
        assert_eq!(chars2.next_back(), Some(Ok('ÿ')));
        assert_eq!(chars2.next_back(), Some(Err(Utf8Error::InvalidEncoding(vec![0xe0]))));

        // Three-byte sequence whose last byte is a lead byte.
        let v = vec![0xe0, 0x80, 0xc0];
        let mut chars = (&v).chars();
        let mut chars2 = chars.clone();
        assert_eq!(chars.size_hint(), (1, Some(3)));
        assert_eq!(chars.next(), Some(Err(Utf8Error::InvalidEncoding(vec![0xe0, 0x80, 0xc0]))));
        assert_eq!(chars2.next_back(), Some(Err(Utf8Error::InvalidEncoding(vec![0xc0]))));
        assert_eq!(chars2.next_back(), Some(Err(Utf8Error::InvalidEncoding(vec![0xe0, 0x80]))));

        // Four-byte lead followed by a lead byte.
        let v = vec![0xf0, 0xc0, 0x80, 0x80];
        let mut chars = (&v).chars();
        let mut chars2 = chars.clone();
        assert_eq!(chars.size_hint(), (1, Some(4)));
        assert_eq!(chars.next(), Some(Err(Utf8Error::InvalidEncoding(vec![0xf0, 0xc0]))));
        assert_eq!(chars2.next_back(), Some(Err(Utf8Error::InvalidEncoding(vec![0xc0, 0x80, 0x80]))));

        // Four-byte sequence whose third byte is a lead byte.
        let v = vec![0xf0, 0x80, 0xc3, 0xbf];
        let mut chars = (&v).chars();
        let mut chars2 = chars.clone();
        assert_eq!(chars.size_hint(), (1, Some(4)));
        assert_eq!(chars.next(), Some(Err(Utf8Error::InvalidEncoding(vec![0xf0, 0x80, 0xc3]))));
        assert_eq!(chars2.next_back(), Some(Ok('ÿ')));
        assert_eq!(chars2.next_back(), Some(Err(Utf8Error::InvalidEncoding(vec![0xf0, 0x80]))));

        // Four-byte sequence whose last byte is a lead byte.
        let v = vec![0xf0, 0x80, 0x80, 0xc0];
        let mut chars = (&v).chars();
        let mut chars2 = chars.clone();
        assert_eq!(chars.size_hint(), (1, Some(4)));
        assert_eq!(chars.next(), Some(Err(Utf8Error::InvalidEncoding(vec![0xf0, 0x80, 0x80, 0xc0]))));
        assert_eq!(chars2.next_back(), Some(Err(Utf8Error::InvalidEncoding(vec![0xc0]))));
        assert_eq!(chars2.next_back(), Some(Err(Utf8Error::InvalidEncoding(vec![0xf0, 0x80, 0x80]))));
    }

    // `next` and `next_back` can be interleaved on the same iterator; iteration ends once the two ends meet.
    #[test]
    fn test_double_ended() {
        let bread = "brød";
        let bread_bytes = bread.as_bytes();
        let mut chars = bread_bytes.chars();
        assert_eq!(chars.next_back(), Some(Ok('d')));
        assert_eq!(chars.next_back(), Some(Ok('ø')));
        assert_eq!(chars.next(), Some(Ok('b')));
        assert_eq!(chars.next_back(), Some(Ok('r')));
        assert_eq!(chars.next_back(), None);
    }

    // Backward decoding of a structurally valid four-byte sequence whose value (0x1efbef) is not a Unicode scalar
    // value.
    #[test]
    fn test_double_ended_invalid_codepoint() {
        let v = vec![0x0, 0xf7, 0xaf, 0xaf, 0xaf];
        let mut chars = (&v).chars();
        assert_eq!(chars.next_back(), Some(Err(Utf8Error::InvalidCodepoint(0x1efbef))));
    }
}