unix_str/
lossy.rs

1use core::char;
2use core::fmt::{self, Write};
3use core::mem;
4use core::str as core_str;
5
// Length in bytes of the UTF-8 sequence introduced by each possible first
// byte, indexed by that byte. A width of `0` marks a byte that cannot start
// a sequence: continuation bytes (0x80..=0xBF), the overlong-only leads
// 0xC0 and 0xC1, and 0xF5..=0xFF.
//
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..=0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..=0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..=0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..=0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..=0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..=0x5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..=0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..=0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..=0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..=0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..=0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..=0xBF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0..=0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..=0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..=0xEF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0..=0xFF
];
25
26/// Given a first byte, determines how many bytes are in this UTF-8 character.
27#[inline]
28pub fn utf8_char_width(b: u8) -> usize {
29    UTF8_CHAR_WIDTH[b as usize] as usize
30}
31
/// Lossy UTF-8 string.
///
/// An unsized wrapper around raw bytes that are not required to be valid
/// UTF-8. References to it are obtained by reinterpreting a `&[u8]` (see
/// `Utf8Lossy::from_bytes`), and its `Display` and `Debug` implementations
/// render malformed sequences instead of failing.
///
/// `#[repr(transparent)]` guarantees that the wrapper has exactly the layout
/// of its single `[u8]` field, which is what makes reinterpreting a `&[u8]`
/// as a `&Utf8Lossy` sound. Without it the layout of the default
/// representation is unspecified.
#[repr(transparent)]
pub struct Utf8Lossy {
    bytes: [u8],
}
36
37impl Utf8Lossy {
38    pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
39        // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
40        unsafe { mem::transmute(bytes) }
41    }
42
43    pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
44        Utf8LossyChunksIter {
45            source: &self.bytes,
46        }
47    }
48}
49
/// Iterator over the chunks of a lossy UTF-8 string.
///
/// `source` holds the bytes that have not been yielded yet; each call to
/// `next` consumes a prefix of it.
#[derive(Clone, Debug)]
pub struct Utf8LossyChunksIter<'a> {
    source: &'a [u8],
}
55
/// One item produced by [`Utf8LossyChunksIter`]: a run of valid UTF-8
/// followed by the malformed bytes (if any) that ended the run.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct Utf8LossyChunk<'a> {
    /// Sequence of valid chars.
    /// Can be empty between broken UTF-8 chars.
    pub valid: &'a str,
    /// Bytes of a single broken char, empty if none.
    /// It is empty only on the last item of the iteration. The last item can
    /// still be non-empty here when the input ends with a broken char.
    pub broken: &'a [u8],
}
65
66impl<'a> Iterator for Utf8LossyChunksIter<'a> {
67    type Item = Utf8LossyChunk<'a>;
68
69    fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
70        if self.source.is_empty() {
71            return None;
72        }
73
74        const TAG_CONT_U8: u8 = 128;
75        fn safe_get(xs: &[u8], i: usize) -> u8 {
76            *xs.get(i).unwrap_or(&0)
77        }
78
79        let mut i = 0;
80        while i < self.source.len() {
81            let i_ = i;
82
83            // SAFETY: `i` starts at `0`, is less than `self.source.len()`, and
84            // only increases, so `0 <= i < self.source.len()`.
85            let byte = unsafe { *self.source.get_unchecked(i) };
86            i += 1;
87
88            if byte < 128 {
89            } else {
90                let w = utf8_char_width(byte);
91
92                macro_rules! error {
93                    () => {{
94                        // SAFETY: We have checked up to `i` that source is valid UTF-8.
95                        unsafe {
96                            let r = Utf8LossyChunk {
97                                valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
98                                broken: &self.source[i_..i],
99                            };
100                            self.source = &self.source[i..];
101                            return Some(r);
102                        }
103                    }};
104                }
105
106                match w {
107                    2 => {
108                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
109                            error!();
110                        }
111                        i += 1;
112                    }
113                    3 => {
114                        match (byte, safe_get(self.source, i)) {
115                            (0xE0, 0xA0..=0xBF) => (),
116                            (0xE1..=0xEC, 0x80..=0xBF) => (),
117                            (0xED, 0x80..=0x9F) => (),
118                            (0xEE..=0xEF, 0x80..=0xBF) => (),
119                            _ => {
120                                error!();
121                            }
122                        }
123                        i += 1;
124                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
125                            error!();
126                        }
127                        i += 1;
128                    }
129                    4 => {
130                        match (byte, safe_get(self.source, i)) {
131                            (0xF0, 0x90..=0xBF) => (),
132                            (0xF1..=0xF3, 0x80..=0xBF) => (),
133                            (0xF4, 0x80..=0x8F) => (),
134                            _ => {
135                                error!();
136                            }
137                        }
138                        i += 1;
139                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
140                            error!();
141                        }
142                        i += 1;
143                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
144                            error!();
145                        }
146                        i += 1;
147                    }
148                    _ => {
149                        error!();
150                    }
151                }
152            }
153        }
154
155        let r = Utf8LossyChunk {
156            // SAFETY: We have checked that the entire source is valid UTF-8.
157            valid: unsafe { core_str::from_utf8_unchecked(self.source) },
158            broken: &[],
159        };
160        self.source = &[];
161        Some(r)
162    }
163}
164
165impl fmt::Display for Utf8Lossy {
166    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
167        // If we're the empty string then our iterator won't actually yield
168        // anything, so perform the formatting manually
169        if self.bytes.is_empty() {
170            return "".fmt(f);
171        }
172
173        for Utf8LossyChunk { valid, broken } in self.chunks() {
174            // If we successfully decoded the whole chunk as a valid string then
175            // we can return a direct formatting of the string which will also
176            // respect various formatting flags if possible.
177            if valid.len() == self.bytes.len() {
178                assert!(broken.is_empty());
179                return valid.fmt(f);
180            }
181
182            f.write_str(valid)?;
183            if !broken.is_empty() {
184                f.write_char(char::REPLACEMENT_CHARACTER)?;
185            }
186        }
187        Ok(())
188    }
189}
190
191impl fmt::Debug for Utf8Lossy {
192    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
193        f.write_char('"')?;
194
195        for Utf8LossyChunk { valid, broken } in self.chunks() {
196            // Valid part.
197            // Here we partially parse UTF-8 again which is suboptimal.
198            {
199                let mut from = 0;
200                for (i, c) in valid.char_indices() {
201                    let esc = c.escape_debug();
202                    // If char needs escaping, flush backlog so far and write, else skip
203                    if esc.len() != 1 {
204                        f.write_str(&valid[from..i])?;
205                        for c in esc {
206                            f.write_char(c)?;
207                        }
208                        from = i + c.len_utf8();
209                    }
210                }
211                f.write_str(&valid[from..])?;
212            }
213
214            // Broken parts of string as hex escape.
215            for &b in broken {
216                write!(f, "\\x{:02x}", b)?;
217            }
218        }
219
220        f.write_char('"')
221    }
222}