1use core::char;
2use core::fmt::{self, Write};
3use core::mem;
4use core::str as core_str;
5
/// Lookup table indexed by a byte value, giving the total length in bytes of
/// the UTF-8 sequence introduced by that byte.
///
/// An entry of `0` marks a byte that does not introduce a sequence: the
/// continuation range `0x80..=0xBF`, the bytes `0xC0` and `0xC1`, and
/// everything from `0xF5` upwards.
static UTF8_CHAR_WIDTH: [u8; 256] = [
    // 0x00..=0x7F: single-byte (ASCII) sequences.
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..=0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..=0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..=0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..=0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..=0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..=0x5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..=0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..=0x7F
    // 0x80..=0xBF: continuation bytes, never a sequence start.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..=0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..=0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..=0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..=0xBF
    // 0xC0 and 0xC1 are rejected; 0xC2..=0xDF start two-byte sequences.
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0..=0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..=0xDF
    // 0xE0..=0xEF start three-byte sequences.
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..=0xEF
    // 0xF0..=0xF4 start four-byte sequences; 0xF5..=0xFF are rejected.
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0..=0xFF
];
25
26#[inline]
28pub fn utf8_char_width(b: u8) -> usize {
29 UTF8_CHAR_WIDTH[b as usize] as usize
30}
31
/// Lossy UTF-8 string: an unsized view over raw bytes that are not required
/// to be valid UTF-8.
///
/// Values of this type only exist behind a reference; `from_bytes` produces
/// one by reinterpreting a `&[u8]` in place.
//
// `repr(transparent)` guarantees that `Utf8Lossy` has exactly the layout of
// its single `[u8]` field. `from_bytes` transmutes `&[u8]` into `&Utf8Lossy`,
// which is only well-defined when that guarantee holds; the default
// representation makes no such promise.
#[repr(transparent)]
pub struct Utf8Lossy {
    /// The wrapped bytes, stored exactly as supplied.
    bytes: [u8],
}
36
37impl Utf8Lossy {
38 pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
39 unsafe { mem::transmute(bytes) }
41 }
42
43 pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
44 Utf8LossyChunksIter {
45 source: &self.bytes,
46 }
47 }
48}
49
/// Iterator over the chunks of a lossy UTF-8 byte string.
///
/// It holds only the not-yet-consumed tail of the input; each call to `next`
/// shortens that tail.
// No `Debug` impl is provided for this type, so the lint is silenced here.
#[allow(missing_debug_implementations)]
pub struct Utf8LossyChunksIter<'a> {
    /// Bytes that have not been handed out in a chunk yet.
    source: &'a [u8],
}
55
/// One item produced by `Utf8LossyChunksIter`: a run of valid UTF-8 followed
/// by the invalid bytes (if any) that ended the run.
#[derive(Debug, PartialEq, Eq)]
pub struct Utf8LossyChunk<'a> {
    /// Valid UTF-8 text. May be empty, e.g. when two invalid sequences are
    /// adjacent.
    pub valid: &'a str,
    /// The bytes of a single invalid sequence that followed `valid`. Empty
    /// when the chunk ran to the end of the input without hitting one.
    pub broken: &'a [u8],
}
65
66impl<'a> Iterator for Utf8LossyChunksIter<'a> {
67 type Item = Utf8LossyChunk<'a>;
68
69 fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
70 if self.source.is_empty() {
71 return None;
72 }
73
74 const TAG_CONT_U8: u8 = 128;
75 fn safe_get(xs: &[u8], i: usize) -> u8 {
76 *xs.get(i).unwrap_or(&0)
77 }
78
79 let mut i = 0;
80 while i < self.source.len() {
81 let i_ = i;
82
83 let byte = unsafe { *self.source.get_unchecked(i) };
86 i += 1;
87
88 if byte < 128 {
89 } else {
90 let w = utf8_char_width(byte);
91
92 macro_rules! error {
93 () => {{
94 unsafe {
96 let r = Utf8LossyChunk {
97 valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
98 broken: &self.source[i_..i],
99 };
100 self.source = &self.source[i..];
101 return Some(r);
102 }
103 }};
104 }
105
106 match w {
107 2 => {
108 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
109 error!();
110 }
111 i += 1;
112 }
113 3 => {
114 match (byte, safe_get(self.source, i)) {
115 (0xE0, 0xA0..=0xBF) => (),
116 (0xE1..=0xEC, 0x80..=0xBF) => (),
117 (0xED, 0x80..=0x9F) => (),
118 (0xEE..=0xEF, 0x80..=0xBF) => (),
119 _ => {
120 error!();
121 }
122 }
123 i += 1;
124 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
125 error!();
126 }
127 i += 1;
128 }
129 4 => {
130 match (byte, safe_get(self.source, i)) {
131 (0xF0, 0x90..=0xBF) => (),
132 (0xF1..=0xF3, 0x80..=0xBF) => (),
133 (0xF4, 0x80..=0x8F) => (),
134 _ => {
135 error!();
136 }
137 }
138 i += 1;
139 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
140 error!();
141 }
142 i += 1;
143 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
144 error!();
145 }
146 i += 1;
147 }
148 _ => {
149 error!();
150 }
151 }
152 }
153 }
154
155 let r = Utf8LossyChunk {
156 valid: unsafe { core_str::from_utf8_unchecked(self.source) },
158 broken: &[],
159 };
160 self.source = &[];
161 Some(r)
162 }
163}
164
165impl fmt::Display for Utf8Lossy {
166 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
167 if self.bytes.is_empty() {
170 return "".fmt(f);
171 }
172
173 for Utf8LossyChunk { valid, broken } in self.chunks() {
174 if valid.len() == self.bytes.len() {
178 assert!(broken.is_empty());
179 return valid.fmt(f);
180 }
181
182 f.write_str(valid)?;
183 if !broken.is_empty() {
184 f.write_char(char::REPLACEMENT_CHARACTER)?;
185 }
186 }
187 Ok(())
188 }
189}
190
191impl fmt::Debug for Utf8Lossy {
192 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
193 f.write_char('"')?;
194
195 for Utf8LossyChunk { valid, broken } in self.chunks() {
196 {
199 let mut from = 0;
200 for (i, c) in valid.char_indices() {
201 let esc = c.escape_debug();
202 if esc.len() != 1 {
204 f.write_str(&valid[from..i])?;
205 for c in esc {
206 f.write_char(c)?;
207 }
208 from = i + c.len_utf8();
209 }
210 }
211 f.write_str(&valid[from..])?;
212 }
213
214 for &b in broken {
216 write!(f, "\\x{:02x}", b)?;
217 }
218 }
219
220 f.write_char('"')
221 }
222}