use std::io;
use std::path::Path;
#[must_use]
pub fn strip_utf8_bom(bytes: &[u8]) -> &[u8] {
if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
&bytes[3..]
} else {
bytes
}
}
pub fn decode_host_text_bytes(bytes: &[u8]) -> Result<String, io::Error> {
use io::ErrorKind::InvalidData;
if bytes.starts_with(&[0xFF, 0xFE]) {
let rest = &bytes[2..];
if !rest.len().is_multiple_of(2) {
return Err(io::Error::new(
InvalidData,
"invalid UTF-16LE: odd byte length after BOM",
));
}
let u16s: Vec<u16> = rest
.chunks_exact(2)
.map(|c| u16::from_le_bytes([c[0], c[1]]))
.collect();
return String::from_utf16(&u16s).map_err(|e| io::Error::new(InvalidData, e));
}
if bytes.starts_with(&[0xFE, 0xFF]) {
let rest = &bytes[2..];
if !rest.len().is_multiple_of(2) {
return Err(io::Error::new(
InvalidData,
"invalid UTF-16BE: odd byte length after BOM",
));
}
let u16s: Vec<u16> = rest
.chunks_exact(2)
.map(|c| u16::from_be_bytes([c[0], c[1]]))
.collect();
return String::from_utf16(&u16s).map_err(|e| io::Error::new(InvalidData, e));
}
let b = strip_utf8_bom(bytes);
String::from_utf8(b.to_vec()).map_err(|e| io::Error::new(InvalidData, e))
}
pub fn read_host_text(path: &Path) -> io::Result<String> {
let bytes = std::fs::read(path)?;
decode_host_text_bytes(&bytes)
}
#[must_use]
pub fn script_text_from_vfs_bytes(bytes: &[u8]) -> Option<String> {
decode_host_text_bytes(bytes).ok()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn utf8_plain() {
assert_eq!(decode_host_text_bytes(b"hello").unwrap(), "hello");
}
#[test]
fn utf8_bom_stripped() {
let mut v = vec![0xEF, 0xBB, 0xBF];
v.extend_from_slice(b"echo ok");
assert_eq!(decode_host_text_bytes(&v).unwrap(), "echo ok");
}
#[test]
fn utf16le_bom_hello() {
let bytes: Vec<u8> = vec![0xFF, 0xFE, 0x48, 0x00, 0x69, 0x00];
assert_eq!(decode_host_text_bytes(&bytes).unwrap(), "Hi");
}
#[test]
fn utf16be_bom_hi() {
let bytes: Vec<u8> = vec![0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69];
assert_eq!(decode_host_text_bytes(&bytes).unwrap(), "Hi");
}
#[test]
fn strip_utf8_bom_only() {
assert_eq!(strip_utf8_bom(b"a"), b"a");
assert_eq!(strip_utf8_bom(&[0xEF, 0xBB, 0xBF]), b"");
}
}