/// Decodes a whole byte buffer as UTF-8, falling back to per-line decoding.
///
/// If `bytes` is valid UTF-8 in its entirety, it is returned unchanged
/// (including any `\r` characters).
///
/// Otherwise the buffer is split on `\n` and each segment is decoded on its
/// own with [`decode_utf8_or_latin1_line`], so segments that are valid UTF-8
/// keep their multi-byte characters and only the invalid ones are decoded
/// byte by byte. The segments are re-joined with `\n`.
///
/// Note that on this fallback path a single trailing `\r` is removed from
/// every segment, so `\r\n` line endings come out as `\n`.
pub fn decode_utf8_or_latin1(bytes: &[u8]) -> String {
    // Fast path: the whole buffer is already valid UTF-8.
    if let Ok(text) = std::str::from_utf8(bytes) {
        return text.to_string();
    }

    let mut decoded = String::new();
    for (index, raw_line) in bytes.split(|&byte| byte == b'\n').enumerate() {
        // Put back the separator that `split` consumed.
        if index > 0 {
            decoded.push('\n');
        }
        // Drop at most one trailing carriage return from the segment.
        let line = raw_line.strip_suffix(b"\r").unwrap_or(raw_line);
        decoded.push_str(&decode_utf8_or_latin1_line(line));
    }
    decoded
}
/// Decodes `line` as UTF-8, falling back to a byte-per-character decoding.
///
/// If `line` is valid UTF-8 it is returned as an owned `String` unchanged.
/// Otherwise *every* byte of `line` (including bytes that happened to form
/// valid UTF-8 sequences) is mapped to the Unicode code point with the same
/// numeric value, U+0000..=U+00FF, i.e. the ISO-8859-1 (Latin-1)
/// interpretation of the bytes.
///
/// No line terminators are added or removed.
pub fn decode_utf8_or_latin1_line(line: &[u8]) -> String {
    match std::str::from_utf8(line) {
        Ok(text) => text.to_owned(),
        // `char::from(u8)` is infallible, so no `unwrap` is needed here.
        Err(_) => line.iter().copied().map(char::from).collect(),
    }
}
/// Decodes a raw byte buffer as UTF-8, falling back to a byte-per-character
/// decoding.
///
/// If `raw` is valid UTF-8 it is returned as an owned `String` unchanged.
/// Otherwise every byte of `raw` is mapped to the Unicode code point with the
/// same numeric value, U+0000..=U+00FF (the ISO-8859-1 / Latin-1
/// interpretation).
///
/// All bytes are kept, so a trailing delimiter such as `\n` is preserved in
/// the output.
// NOTE(review): judging by the name, `raw` is presumably a buffer filled by
// `BufRead::read_until` — confirm against callers.
pub fn decode_utf8_or_latin1_read_until(raw: &[u8]) -> String {
    match std::str::from_utf8(raw) {
        Ok(text) => text.to_owned(),
        // `char::from(u8)` is infallible, so no `unwrap` is needed here.
        Err(_) => raw.iter().copied().map(char::from).collect(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A buffer that is valid UTF-8 as a whole is returned unchanged.
    #[test]
    fn decode_utf8_or_latin1_accepts_valid_utf8_whole_file() {
        let input = "café";
        assert_eq!(decode_utf8_or_latin1(input.as_bytes()), input);
    }

    /// Bytes that are not valid UTF-8 map one-to-one onto U+0000..=U+00FF.
    #[test]
    fn decode_utf8_or_latin1_line_maps_octets() {
        let decoded = decode_utf8_or_latin1_line(&[0xff, 0xfe]);
        assert_eq!(decoded, "\u{00ff}\u{00fe}");
    }

    /// The fallback decodes byte by byte and keeps the trailing newline.
    #[test]
    fn decode_utf8_or_latin1_read_until_falls_back_per_byte_when_not_utf8() {
        let raw = [b'a', 0xff, b'\n'];
        assert_eq!(decode_utf8_or_latin1_read_until(&raw), "a\u{00ff}\n");
    }

    /// An invalid byte on a later line does not disturb the earlier line.
    #[test]
    fn decode_utf8_or_latin1_multiline_latin1_only_on_later_lines() {
        let input: &[u8] = b"ascii\n\xfe\n";
        assert_eq!(decode_utf8_or_latin1(input), "ascii\n\u{00fe}\n");
    }
}