1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
use std::fmt::Display;
use std::ops::Range;
use std::string::FromUtf8Error;
/// A [`std::string::FromUtf8Error`] formatted with the text decoded in a best-effort manner.
pub(crate) struct FromUtf8ErrorContext<'inner> {
inner: &'inner FromUtf8Error,
max_size: usize,
}
impl<'a> FromUtf8ErrorContext<'a> {
pub(crate) fn new(inner: &'a FromUtf8Error, max_size: usize) -> Self {
Self { inner, max_size }
}
/// Get a 'window' of bytes to display in the error message.
///
/// This is a range of (at most) `max_size` that the input can be sliced on to display the
/// portion of input around the encoding error.
fn window(&self) -> Range<usize> {
let bytes = self.inner.as_bytes();
let mut range = self.window_unadjusted();
if range.start != 0 && !is_codepoint_boundary(bytes[range.start]) {
// Note: I think this will always be adjusted up because the input up to the error
// index is valid UTF-8, and the lower bound is always before the error index.
range.start = self
.adjust_index_up(range.start)
.or_else(|| self.adjust_index_down(range.start))
.unwrap_or(range.start);
}
if range.end != bytes.len() && !is_codepoint_boundary(bytes[range.end]) {
range.end = self
.adjust_index_down(range.end)
.or_else(|| self.adjust_index_up(range.end))
.unwrap_or(range.end);
}
range
}
/// Get an unadjusted 'window' of bytes to display in the error message.
///
/// The indexes in this range have not been checked to make sure they lie on UTF-8 boundaries.
fn window_unadjusted(&self) -> Range<usize> {
let bytes = self.inner.as_bytes();
if bytes.len() <= self.max_size {
return 0..bytes.len();
}
// Half the length of the window.
let half_window = self.max_size / 2;
let error_index = self.inner.utf8_error().valid_up_to();
let upper_bound = error_index + half_window;
if upper_bound >= bytes.len() {
// The natural window centered on `error_index` extends past the end of the input. Use
// the end of the input as the right endpoint.
return bytes.len() - self.max_size..bytes.len();
}
let lower_bound = error_index.checked_sub(half_window);
if lower_bound.is_none() {
// The natural window centered on `error_index` extends before the start of the input. Use
// the start of the input as the left endpoint.
return 0..self.max_size;
}
// The natural window is contained entirely within the input.
error_index - half_window..error_index + half_window
}
/// Adjust the given index so that it lies on a UTF-8 boundary in the input, if possible.
///
/// This is done by adjusting the index up to 3 bytes downwards.
fn adjust_index_down(&self, index: usize) -> Option<usize> {
// Logic adapted from unstable `std` method:
// https://github.com/rust-lang/rust/blob/a7e4de13c1785819f4d61da41f6704ed69d5f203/library/core/src/str/mod.rs#L264-L276
let bytes = self.inner.as_bytes();
let lower_bound = index.saturating_sub(3);
bytes[lower_bound..=index]
.iter()
.rposition(|&b| is_codepoint_boundary(b))
.map(|i| i + lower_bound)
}
/// Adjust the given index so that it lies on a UTF-8 boundary in the input, if possible.
///
/// This is done by adjusting the index up to 3 bytes upwards.
fn adjust_index_up(&self, index: usize) -> Option<usize> {
// Logic adapted from unstable `std` method:
// https://github.com/rust-lang/rust/blob/a7e4de13c1785819f4d61da41f6704ed69d5f203/library/core/src/str/mod.rs#L302-L311
let bytes = self.inner.as_bytes();
let upper_bound = Ord::min(index + 4, bytes.len());
bytes[index..upper_bound]
.iter()
.position(|&b| is_codepoint_boundary(b))
.map(|i| i + index)
}
}
impl<'a> Display for FromUtf8ErrorContext<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let bytes = self.inner.as_bytes();
if bytes.len() <= self.max_size {
write!(f, "{:?}", String::from_utf8_lossy(bytes))
} else {
let range = self.window();
let before = range.start;
let after = bytes.len() - range.end;
if before != 0 {
write!(f, "{} ", ByteCount(before))?;
}
// TODO: It might be nice to print the hex values of the bytes like `\x62` instead of
// just `�` U+FFFD REPLACEMENT CHARACTER.
write!(f, "{:?}", String::from_utf8_lossy(&bytes[range]))?;
if after != 0 {
write!(f, " {}", ByteCount(after))?;
}
Ok(())
}
}
}
fn is_codepoint_boundary(byte: u8) -> bool {
// Stolen from a private `std` method:
// https://github.com/rust-lang/rust/blob/a7e4de13c1785819f4d61da41f6704ed69d5f203/library/core/src/num/mod.rs#L1101-L1104
// This is bit magic equivalent to: b < 128 || b >= 192
(byte as i8) >= -0x40
}
struct ByteCount(usize);
impl Display for ByteCount {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.0 == 1 {
write!(f, "[1 byte]")
} else {
write!(f, "[{} bytes]", self.0)
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn err(bytes: &[u8]) -> FromUtf8Error {
String::from_utf8(bytes.to_vec()).unwrap_err()
}
#[test]
fn test_simple() {
assert_eq!(
FromUtf8ErrorContext {
inner: &err(b"puppy\xc0doggy"),
max_size: 32,
}
.to_string(),
"\"puppy�doggy\""
);
}
#[test]
fn test_truncation() {
// Adjusts the lower bound up (3->4) and the upper bound down (35->32).
assert_eq!(
FromUtf8ErrorContext {
inner: &err(b"\xf0\x9f\x98\x8a\
\xf0\x9f\x98\x8a\
\xe2\x9c\x93\
\xf0\x9f\x98\x8a\
\xf0\x9f\x98\x8a\
\xc0\
\xf0\x9f\x98\x8a\
\xf0\x9f\x98\x8a\
\xf0\x9f\x98\x8a\
\xf0\x9f\x98\x8a\
\xf0\x9f\x98\x8a"),
max_size: 32,
}
.to_string(),
"[4 bytes] \"😊✓😊😊�😊😊😊\" [8 bytes]"
);
}
#[test]
fn test_truncation_up() {
// Adjusts the lower bound up (3->4) and the upper bound up (35->37).
assert_eq!(
FromUtf8ErrorContext {
inner: &err(b"\xf0\x9f\x98\x8a\
\xf0\x9f\x98\x8a\
\xe2\x9c\x93\
\xf0\x9f\x98\x8a\
\xf0\x9f\x98\x8a\
\xc0\
\xf0\x9f\x98\x8a\
\xf0\x9f\x98\x8a\
\xf0\x9f\x98\x8a\
\x80\x80\x80\x80\
\x80\x62\x80\x80"),
max_size: 32,
}
.to_string(),
"[4 bytes] \"😊✓😊😊�😊😊😊�����\" [3 bytes]"
);
}
#[test]
fn test_truncation_near_end() {
assert_eq!(
FromUtf8ErrorContext {
inner: &err(b"puppy puppy puppy puppy puppy \
doggy doggy doggy doggy\xc0doggy"),
max_size: 32,
}
.to_string(),
"[27 bytes] \"py doggy doggy doggy doggy�doggy\""
);
assert_eq!(
FromUtf8ErrorContext {
inner: &err(b"puppy puppy puppy puppy puppy \
doggy doggy doggy doggy\xc0"),
max_size: 32,
}
.to_string(),
"[22 bytes] \"y puppy doggy doggy doggy doggy�\""
);
}
#[test]
fn test_truncation_near_start() {
assert_eq!(
FromUtf8ErrorContext {
inner: &err(b"puppy\xc0puppy puppy puppy puppy \
doggy doggy doggy doggy doggy"),
max_size: 32,
}
.to_string(),
"\"puppy�puppy puppy puppy puppy do\" [27 bytes]"
);
assert_eq!(
FromUtf8ErrorContext {
inner: &err(b"\xc0puppy puppy puppy puppy puppy \
doggy doggy doggy doggy"),
max_size: 32,
}
.to_string(),
"\"�puppy puppy puppy puppy puppy d\" [22 bytes]"
);
}
}