/// Base code point (U+E000, the start of the Unicode Private Use Area) used to escape bytes.
///
/// `bytes_to_str` represents a byte `b` that is not valid UTF-8 as the code point
/// `PUA_BASE + b`, and `str_to_bytes` turns code points in
/// `PUA_BASE + 0x80..=PUA_BASE + 0xFF` back into the single byte `b`.
const PUA_BASE: u32 = 0xE000;
pub fn bytes_to_str(bytes: &[u8]) -> String {
let mut result = String::with_capacity(bytes.len());
let mut i = 0;
while i < bytes.len() {
match std::str::from_utf8(&bytes[i..]) {
Ok(s) => {
result.push_str(s);
break;
}
Err(e) => {
let valid_up_to = e.valid_up_to();
if valid_up_to > 0 {
result.push_str(unsafe {
std::str::from_utf8_unchecked(&bytes[i..i + valid_up_to])
});
}
i += valid_up_to;
let b = bytes[i] as u32;
result.push(char::from_u32(PUA_BASE + b).unwrap());
i += 1;
}
}
}
result
}
pub fn str_to_bytes(s: &str) -> Vec<u8> {
let mut result = Vec::with_capacity(s.len());
for c in s.chars() {
let cp = c as u32;
if (PUA_BASE + 0x80..=PUA_BASE + 0xFF).contains(&cp) {
result.push((cp - PUA_BASE) as u8);
} else {
let mut buf = [0u8; 4];
let encoded = c.encode_utf8(&mut buf);
result.extend_from_slice(encoded.as_bytes());
}
}
result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII decodes to itself and encodes back to the same bytes.
    #[test]
    fn roundtrip_ascii() {
        let raw = b"hello world";
        let decoded = bytes_to_str(raw);
        assert_eq!(decoded, "hello world");
        assert_eq!(str_to_bytes(&decoded), raw);
    }

    /// Valid multi-byte UTF-8 decodes to itself and encodes back to the same bytes.
    #[test]
    fn roundtrip_utf8() {
        let raw = "héllo wörld".as_bytes();
        let decoded = bytes_to_str(raw);
        assert_eq!(decoded, "héllo wörld");
        assert_eq!(str_to_bytes(&decoded), raw);
    }

    /// A lone invalid byte is escaped on decode and restored on encode.
    #[test]
    fn roundtrip_invalid_byte() {
        let raw = &[0x5B, 0xA3];
        let decoded = bytes_to_str(raw);
        // One byte for `[` plus a three-byte escape code point for 0xA3.
        assert_eq!(decoded.len(), 4);
        let encoded = str_to_bytes(&decoded);
        assert_eq!(encoded, raw);
    }

    /// Invalid bytes interleaved with ASCII survive a decode/encode cycle.
    #[test]
    fn roundtrip_mixed() {
        let raw = &[b'a', 0x80, b'b', 0xFF, b'c'];
        let decoded = bytes_to_str(raw);
        let encoded = str_to_bytes(&decoded);
        assert_eq!(encoded, raw);
    }

    /// U+E000 is not treated as an escaped byte, so it passes through both directions.
    #[test]
    fn pure_utf8_passthrough() {
        let text = "\u{E000}hello";
        let encoded = str_to_bytes(text);
        let decoded = bytes_to_str(&encoded);
        assert_eq!(decoded, text);
    }
}