use chardetng::EncodingDetector;
use std::fs::read;
use std::io::Result;
use std::path::Path;
#[bve_derive::span(DEBUG, "Read to UTF-8", filename = ?filename.as_ref())]
pub fn read_convert_utf8(filename: impl AsRef<Path>) -> Result<String> {
let bytes = read(filename)?;
Ok(convert_to_utf8(bytes))
}
/// Decodes `bytes` into a UTF-8 [`String`], choosing the source encoding as follows:
///
/// 1. If the input starts with a UTF-8, UTF-16LE or UTF-16BE byte order mark,
///    the encoding named by that BOM is used.
/// 2. Otherwise the bytes are fed to a `chardetng` [`EncodingDetector`]. If the
///    detector reports that it saw nothing but ASCII, the buffer is reused
///    directly as the resulting string without any decoding pass.
/// 3. Otherwise the detector's guess is used.
///
/// A leading BOM matching the chosen encoding is stripped from the output.
/// The extra values returned by the decoder alongside the text are ignored.
///
/// # Panics
///
/// Panics if the detector reports ASCII-only input but the bytes fail UTF-8
/// validation; that would indicate a broken invariant rather than bad input.
#[bve_derive::span(TRACE, "UTF-8 Conversion", size = bytes.len())]
fn convert_to_utf8(bytes: Vec<u8>) -> String {
    tracing::trace!("Converting file of {} bytes", bytes.len());

    // `Encoding::for_bom` recognises the same three BOMs the hand-written prefix
    // checks did. The BOM length it reports is not needed because
    // `decode_with_bom_removal` below strips the BOM itself.
    let (encoding, reason) = if let Some((encoding, _bom_len)) = encoding_rs::Encoding::for_bom(&bytes) {
        (encoding, "BOM")
    } else {
        let mut detector = EncodingDetector::new();
        // The whole input is supplied in one call, hence `last = true`.
        // A `false` return from `feed` is treated as "only ASCII was seen".
        let ascii_only = !detector.feed(&bytes, true);
        if ascii_only {
            tracing::debug!("UTF-8 chosen due to All ASCII");
            // ASCII is a subset of UTF-8, so the buffer can be reused as-is.
            return String::from_utf8(bytes).expect("Only ascii characters detected, but utf8 validation failed");
        }
        (detector.guess(None, true), "chardetng")
    };
    tracing::debug!("{} chosen due to {}", encoding.name(), reason);

    let (result, ..) = encoding.decode_with_bom_removal(&bytes);
    tracing::trace!("Converted UTF-8 is {} bytes", result.len());

    // `into_owned` hands back the decoder's `String` when it already allocated
    // one, and only copies when the `Cow` is borrowed from `bytes`.
    result.into_owned()
}
#[cfg(test)]
mod test {
    use super::convert_to_utf8;

    /// Input consisting solely of a byte order mark must decode to an empty string.
    #[bve_derive::bve_test]
    #[test]
    fn bom_removal() {
        // UTF-16LE, UTF-16BE and UTF-8 byte order marks, in that order.
        let bom_only_inputs: [&[u8]; 3] = [&[0xFF, 0xFE], &[0xFE, 0xFF], &[0xEF, 0xBB, 0xBF]];
        for input in bom_only_inputs.iter() {
            assert_eq!(convert_to_utf8(input.to_vec()), "");
        }
    }

    /// BOM-less Shift-JIS input must be detected and decoded to the expected text.
    #[bve_derive::bve_test]
    #[test]
    fn shift_jis() {
        let encoded: &[u8] =
            b"\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd\x81\x41\x8c\xb3\x8b\x43\x82\xc5\x82\xb7\x82\xa9\x81\x48";
        let decoded = convert_to_utf8(encoded.to_vec());
        assert_eq!(decoded, "こんにちは、元気ですか?");
    }
}