use std::borrow::Cow;
/// Normalizes raw file bytes to BOM-free UTF-8 based on a leading byte-order mark.
///
/// * UTF-8 BOM (`EF BB BF`): the BOM is stripped and the remaining bytes are
///   returned as a borrowed sub-slice (no copy).
/// * UTF-16 LE BOM (`FF FE`) or UTF-16 BE BOM (`FE FF`): the bytes after the BOM
///   are transcoded to UTF-8 by `decode_utf16` and returned as an owned buffer.
/// * Anything else (including empty input): `content` is returned borrowed and
///   unchanged.
///
/// `verbose` is only forwarded to `decode_utf16`, which uses it to decide
/// whether to print a warning on stderr.
///
/// NOTE(review): a UTF-32 LE BOM (`FF FE 00 00`) also starts with `FF FE`, so
/// such input takes the UTF-16 LE arm here.
pub(crate) fn normalize_encoding(content: &[u8], verbose: bool) -> Cow<'_, [u8]> {
    match content {
        // UTF-8 with BOM: already the right encoding, just drop the marker.
        [0xEF, 0xBB, 0xBF, tail @ ..] => Cow::Borrowed(tail),
        // UTF-16, little-endian code units.
        [0xFF, 0xFE, tail @ ..] => Cow::Owned(decode_utf16(tail, u16::from_le_bytes, verbose)),
        // UTF-16, big-endian code units.
        [0xFE, 0xFF, tail @ ..] => Cow::Owned(decode_utf16(tail, u16::from_be_bytes, verbose)),
        // No recognised BOM: hand the input back untouched.
        _ => Cow::Borrowed(content),
    }
}
/// Transcodes UTF-16 payload bytes (BOM already removed) into UTF-8 bytes.
///
/// `from_bytes` assembles one 16-bit code unit from two consecutive input
/// bytes, which is how the caller selects endianness (e.g.
/// `u16::from_le_bytes` or `u16::from_be_bytes`).
///
/// Code units that do not form a valid scalar value (unpaired surrogates) are
/// replaced with U+FFFD. If `bytes` has odd length the final byte is ignored;
/// when `verbose` is set a warning about that is written to stderr.
fn decode_utf16(bytes: &[u8], from_bytes: fn([u8; 2]) -> u16, verbose: bool) -> Vec<u8> {
    let pairs = bytes.chunks_exact(2);
    let has_dangling_byte = !pairs.remainder().is_empty();
    if verbose && has_dangling_byte {
        eprintln!(
            "syntext: warning: UTF-16 file has odd byte count ({} bytes after BOM); trailing byte dropped",
            bytes.len()
        );
    }

    // `chunks_exact(2)` guarantees every chunk holds exactly two bytes.
    let code_units = pairs.map(|pair| from_bytes([pair[0], pair[1]]));

    let mut text = String::new();
    for decoded in char::decode_utf16(code_units) {
        // An `Err` here is an unpaired surrogate; substitute U+FFFD.
        text.push(decoded.unwrap_or('\u{FFFD}'));
    }
    text.into_bytes()
}
#[cfg(test)]
mod tests {
    //! Unit tests for `normalize_encoding`: BOM detection, UTF-16 transcoding,
    //! and the zero-copy guarantee for inputs that are already UTF-8.

    use super::*;

    /// Input without a BOM is passed through as a borrow, byte-for-byte.
    #[test]
    fn no_bom_returns_borrowed() {
        let content = b"fn main() {}";
        let result = normalize_encoding(content, false);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "plain UTF-8 must return Cow::Borrowed (zero copy)"
        );
        assert_eq!(result.as_ref(), content);
    }

    /// A UTF-8 BOM is removed without copying the payload.
    #[test]
    fn utf8_bom_stripped() {
        let input = b"\xEF\xBB\xBFfn main() {}";
        let result = normalize_encoding(input, false);
        assert!(
            matches!(result, Cow::Borrowed(_)),
            "stripping a UTF-8 BOM must not allocate"
        );
        assert_eq!(result.as_ref(), b"fn main() {}");
    }

    /// A file consisting solely of a UTF-8 BOM normalizes to empty content.
    #[test]
    fn utf8_bom_only_file() {
        let result = normalize_encoding(b"\xEF\xBB\xBF", false);
        assert_eq!(result.as_ref(), b"");
    }

    #[test]
    fn utf16_le_ascii_transcoded() {
        let input: &[u8] = b"\xFF\xFEh\x00i\x00\n\x00";
        let result = normalize_encoding(input, false);
        assert_eq!(result.as_ref(), b"hi\n");
    }

    #[test]
    fn utf16_be_ascii_transcoded() {
        let input: &[u8] = b"\xFE\xFF\x00h\x00i\x00\n";
        let result = normalize_encoding(input, false);
        assert_eq!(result.as_ref(), b"hi\n");
    }

    /// Despite the name, the payload here is a lone high surrogate (0xD800),
    /// not a complete non-BMP character; it must decode to U+FFFD.
    #[test]
    fn utf16_le_non_bmp_replacement_char() {
        let input: &[u8] = b"\xFF\xFE\x00\xD8";
        let result = normalize_encoding(input, false);
        assert_eq!(result.as_ref(), "\u{FFFD}".as_bytes());
    }

    /// A well-formed surrogate pair (U+1F600) is combined into one scalar value.
    #[test]
    fn utf16_le_surrogate_pair_transcoded() {
        let input: &[u8] = b"\xFF\xFE\x3D\xD8\x00\xDE";
        let result = normalize_encoding(input, false);
        assert_eq!(result.as_ref(), "\u{1F600}".as_bytes());
    }

    /// Big-endian counterpart of the lone-surrogate case.
    #[test]
    fn utf16_be_lone_surrogate_replacement_char() {
        let input: &[u8] = b"\xFE\xFF\xD8\x00";
        let result = normalize_encoding(input, false);
        assert_eq!(result.as_ref(), "\u{FFFD}".as_bytes());
    }

    /// The dangling final byte of an odd-length UTF-16 payload is dropped.
    #[test]
    fn utf16_le_odd_byte_trailing_truncated() {
        let input: &[u8] = b"\xFF\xFEh\x00i";
        let result = normalize_encoding(input, false);
        assert_eq!(result.as_ref(), b"h");
    }

    /// Big-endian counterpart of the odd-length case.
    #[test]
    fn utf16_be_odd_byte_trailing_truncated() {
        let input: &[u8] = b"\xFE\xFF\x00h\x00";
        let result = normalize_encoding(input, false);
        assert_eq!(result.as_ref(), b"h");
    }

    /// A UTF-16 file that contains only a BOM normalizes to empty content.
    #[test]
    fn utf16_bom_only_file() {
        let le = normalize_encoding(b"\xFF\xFE", false);
        assert_eq!(le.as_ref(), b"");
        let be = normalize_encoding(b"\xFE\xFF", false);
        assert_eq!(be.as_ref(), b"");
    }

    #[test]
    fn empty_content_returns_borrowed() {
        let result = normalize_encoding(b"", false);
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result.as_ref(), b"");
    }

    /// Round-trip: encode a source snippet as UTF-16 LE and normalize it back.
    #[test]
    fn utf16_le_source_code() {
        let src = "fn main() {}";
        let utf16le: Vec<u8> = src.encode_utf16().flat_map(|u| u.to_le_bytes()).collect();
        let mut input = vec![0xFF, 0xFE];
        input.extend_from_slice(&utf16le);
        let result = normalize_encoding(&input, false);
        assert_eq!(result.as_ref(), src.as_bytes());
    }

    /// Same as the odd-length case but with `verbose` enabled; the decoded
    /// output must be unaffected by the warning path.
    #[test]
    fn utf16_le_odd_byte_verbose_warning() {
        let input: &[u8] = b"\xFF\xFEh\x00i";
        let result = normalize_encoding(input, true);
        assert_eq!(result.as_ref(), b"h");
    }
}