Skip to main content

utf8_decode/
fallible.rs

1use crate::Utf8Error;
2
/// Iterator adapter decoding UTF-8 out of a fallible byte source.
///
/// Wraps an iterator of [`Result<u8, E>`] items and yields [`Result<char, E>`]
/// items in turn.
///
/// ## Example
///
/// A typical use of `TryDecoder` is decoding the bytes of a UTF-8 encoded file.
///
/// ```rust
/// # use std::{fs::File, io::Read};
/// use utf8_decode::TryDecoder;
/// # fn main() -> std::io::Result<()> {
/// let file = File::open("examples/file.txt")?;
///
/// let decoder = TryDecoder::new(file.bytes());
///
/// let mut string = String::new();
/// for c in decoder {
///     string.push(c?);
/// }
/// # Ok(())
/// # }
/// ```
pub struct TryDecoder<R> {
    // Wrapped source the bytes are pulled from.
    bytes: R,
    // Byte offset handed to the decoding routine for the next character.
    offset: usize,
}
31
32impl<R> TryDecoder<R> {
33    /// Creates a new `Decoder` iterator from the given [`Result<u8>`](std::io::Result) source
34    /// iterator.
35    pub fn new(source: R) -> TryDecoder<R> {
36        TryDecoder {
37            bytes: source,
38            offset: 0,
39        }
40    }
41}
42
43impl<R, E> Iterator for TryDecoder<R>
44where
45    R: Iterator<Item = Result<u8, E>>,
46    E: From<Utf8Error>,
47{
48    type Item = Result<char, E>;
49
50    fn next(&mut self) -> Option<Result<char, E>> {
51        Some(
52            try_decode_iter_char(self.offset, &mut self.bytes)
53                .transpose()?
54                .map(|(c, len)| {
55                    self.offset += len as usize;
56                    c
57                }),
58        )
59    }
60}
61
62/// Read the next Unicode character out of the given fallible byte iterator.
63pub fn try_decode_iter_char<E>(
64    offset: usize,
65    iter: &mut impl Iterator<Item = Result<u8, E>>,
66) -> Result<Option<(char, u8)>, E>
67where
68    E: From<Utf8Error>,
69{
70    match try_decode_iter_codepoint(offset, iter)? {
71        Some((codepoint, len)) => match char::from_u32(codepoint) {
72            Some(c) => Ok(Some((c, len))),
73            None => Err(Utf8Error::new(offset, len as usize).into()),
74        },
75        None => Ok(None),
76    }
77}
78
79/// Read the next Unicode codepoint.
80///
81/// - `offset` is the byte offset of the codepoint in the byte string. This will
82///   be returned in any enventual `Utf8Error`.
83///
84/// Returns the codepoint as a `u32` and its encoded byte length.
85fn try_decode_iter_codepoint<E>(
86    offset: usize,
87    iter: &mut impl Iterator<Item = Result<u8, E>>,
88) -> Result<Option<(u32, u8)>, E>
89where
90    E: From<Utf8Error>,
91{
92    match iter.next() {
93        Some(Ok(a)) => {
94            let a = a as u32;
95            if a & 0x80 == 0x00 {
96                // 1 byte.
97                Ok(Some((a, 1)))
98            } else if a & 0xE0 == 0xC0 {
99                // 2 bytes.
100                let b = try_next_iter_byte(iter, offset, 1)?;
101                Ok(Some(((a & 0x1F) << 6 | b, 2)))
102            } else if a & 0xF0 == 0xE0 {
103                // 3 bytes.
104                let b = try_next_iter_byte(iter, offset, 1)?;
105                let c = try_next_iter_byte(iter, offset, 2)?;
106                Ok(Some(((a & 0x0F) << 12 | b << 6 | c, 3)))
107            } else if a & 0xF8 == 0xF0 {
108                // 4 bytes.
109                let b = try_next_iter_byte(iter, offset, 1)?;
110                let c = try_next_iter_byte(iter, offset, 2)?;
111                let d = try_next_iter_byte(iter, offset, 3)?;
112                Ok(Some(((a & 0x07) << 18 | b << 12 | c << 6 | d, 4)))
113            } else {
114                Err(Utf8Error::new(offset, 1).into())
115            }
116        }
117        Some(Err(e)) => Err(e),
118        None => Ok(None),
119    }
120}
121
122/// Read the next byte of the UTF-8 character out of the given byte iterator.
123///
124/// - `offset` is the byte offset of the current codepoint.
125/// - `len` is the number of parsed bytes of the current codepoint (excluding
126///   this one).
127///
128/// The byte is returned as a `u32` for later shifting.
129fn try_next_iter_byte<E>(
130    iter: &mut impl Iterator<Item = Result<u8, E>>,
131    offset: usize,
132    len: usize,
133) -> Result<u32, E>
134where
135    E: From<Utf8Error>,
136{
137    match iter.next() {
138        Some(Ok(c)) => {
139            if c & 0xC0 == 0x80 {
140                Ok((c & 0x3F) as u32)
141            } else {
142                Err(Utf8Error::new(offset, len + 1).into())
143            }
144        }
145        Some(Err(e)) => Err(e),
146        None => Err(Utf8Error::new(offset, len).into()),
147    }
148}