/// Returns `s` without its leading U+FEFF (byte-order mark) character.
///
/// Only a single BOM at the very start is removed; a BOM anywhere else in the
/// string is left alone. When `s` does not start with a BOM it is returned
/// unchanged. The result always borrows from `s`, so nothing is allocated.
pub fn strip_bom(s: &str) -> &str {
    const BOM: char = '\u{feff}';
    match s.strip_prefix(BOM) {
        Some(without_bom) => without_bom,
        None => s,
    }
}
/// Byte order used when combining each pair of bytes into a UTF-16 code unit.
#[derive(Clone, Copy)]
enum Endian {
/// The first byte of each pair is the least-significant one.
Little,
/// The first byte of each pair is the most-significant one.
Big,
}
/// Decodes `bytes` as UTF-16 in the given byte order, replacing invalid
/// sequences (such as unpaired surrogates) with U+FFFD.
///
/// The input is consumed two bytes at a time; if `bytes` has an odd length the
/// final dangling byte is ignored.
fn decode_utf16(bytes: &[u8], endian: Endian) -> String {
    // Choose the byte-pair combiner once instead of matching on every chunk.
    let to_unit: fn([u8; 2]) -> u16 = match endian {
        Endian::Little => u16::from_le_bytes,
        Endian::Big => u16::from_be_bytes,
    };
    let code_units: Vec<u16> = bytes
        .chunks_exact(2)
        .map(|pair| to_unit([pair[0], pair[1]]))
        .collect();
    String::from_utf16_lossy(&code_units)
}
/// Decodes raw bytes to text, guessing the encoding.
///
/// Detection order:
/// 1. A UTF-8 BOM (`EF BB BF`): the remainder is decoded as lossy UTF-8.
/// 2. A UTF-16LE BOM (`FF FE`), unless it is followed by `00 00` (the UTF-32LE
///    BOM pattern): the remainder is decoded as UTF-16 little-endian.
/// 3. A UTF-16BE BOM (`FE FF`): the remainder is decoded as UTF-16 big-endian.
/// 4. No recognised BOM, and at most one byte in eight is NUL: if the input is
///    valid UTF-8 it is returned as-is.
/// 5. No recognised BOM, more than one byte in eight is NUL, and there are at
///    least two bytes: the whole input is decoded as UTF-16, with the byte
///    order inferred from where the NUL bytes sit.
/// 6. Anything else is decoded as lossy UTF-8.
///
/// A U+FEFF character can still be present at the start of the result; this
/// function does not strip it from the decoded text.
fn decode_inner(data: &[u8]) -> String {
    // An explicit BOM takes priority over any guessing.
    match data {
        [0xEF, 0xBB, 0xBF, rest @ ..] => return String::from_utf8_lossy(rest).into_owned(),
        // FF FE 00 00 is deliberately not treated as UTF-16LE; it falls
        // through to the heuristics below. This arm must stay ahead of the
        // plain FF FE arm.
        [0xFF, 0xFE, 0x00, 0x00, ..] => {}
        [0xFF, 0xFE, rest @ ..] => return decode_utf16(rest, Endian::Little),
        [0xFE, 0xFF, rest @ ..] => return decode_utf16(rest, Endian::Big),
        _ => {}
    }

    // "NUL-heavy" input (more than 1 in 8 bytes is zero) is taken as a sign of
    // BOM-less UTF-16 rather than UTF-8.
    let nul_total = data.iter().filter(|&&byte| byte == 0).count();
    let nul_heavy = nul_total > data.len() / 8;

    if !nul_heavy {
        if let Ok(text) = std::str::from_utf8(data) {
            return text.to_owned();
        }
    }

    if nul_heavy && data.len() >= 2 {
        // NULs mostly at even offsets suggest the high byte comes first
        // (big-endian); otherwise, including ties, assume little-endian.
        let (mut at_even, mut at_odd) = (0usize, 0usize);
        for (index, &byte) in data.iter().enumerate() {
            if byte == 0 {
                if index % 2 == 0 {
                    at_even += 1;
                } else {
                    at_odd += 1;
                }
            }
        }
        let endian = if at_odd >= at_even {
            Endian::Little
        } else {
            Endian::Big
        };
        return decode_utf16(data, endian);
    }

    String::from_utf8_lossy(data).into_owned()
}
/// Decodes `data` into a `String` via `decode_inner`, then drops a single
/// leading U+FEFF character from the decoded text if one is present.
pub fn decode_bytes(data: &[u8]) -> String {
    let decoded = decode_inner(data);
    if let Some(without_bom) = decoded.strip_prefix('\u{feff}') {
        return without_bom.to_owned();
    }
    decoded
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `text` as UTF-16 little-endian bytes, without a BOM.
    fn utf16le(text: &str) -> Vec<u8> {
        text.encode_utf16()
            .flat_map(|unit| unit.to_le_bytes())
            .collect()
    }

    /// Encodes `text` as UTF-16 big-endian bytes, without a BOM.
    fn utf16be(text: &str) -> Vec<u8> {
        text.encode_utf16()
            .flat_map(|unit| unit.to_be_bytes())
            .collect()
    }

    #[test]
    fn strip_bom_removes_only_a_leading_bom() {
        let cases = [
            ("\u{feff}hi", "hi"),
            ("hi", "hi"),
            ("hi\u{feff}", "hi\u{feff}"),
        ];
        for (input, expected) in cases {
            assert_eq!(strip_bom(input), expected);
        }
    }

    #[test]
    fn plain_utf8_passes_through() {
        let script = "$x = 1";
        assert_eq!(decode_bytes(script.as_bytes()), script);
    }

    #[test]
    fn utf8_bom_is_stripped() {
        // EF BB BF is the UTF-8 encoding of U+FEFF.
        assert_eq!(decode_bytes(b"\xEF\xBB\xBF$x = 1"), "$x = 1");
    }

    #[test]
    fn utf16le_with_bom_decodes_without_bom() {
        let expected = "Write-Output 'hi \u{20ac}'";
        let data = [vec![0xFF, 0xFE], utf16le(expected)].concat();
        let decoded = decode_bytes(&data);
        assert_eq!(decoded, expected);
        assert!(!decoded.starts_with('\u{feff}'));
    }

    #[test]
    fn utf16be_with_bom_decodes_without_bom() {
        let expected = "Get-ChildItem";
        let data = [vec![0xFE, 0xFF], utf16be(expected)].concat();
        assert_eq!(decode_bytes(&data), expected);
    }

    #[test]
    fn bomless_utf16le_detected_by_nul_pattern() {
        let expected = "$path = 'C:\\temp'";
        assert_eq!(decode_bytes(&utf16le(expected)), expected);
    }
}