use encoding_rs::Encoding;
use once_cell::sync::Lazy;
use std::{env, fs::File, io::Read as _, path::Path};
/// Legacy encoding suggested by the `LANG` environment variable, if any.
///
/// The value is computed by `get_lang_encoding` on first access and then
/// cached, so changes to `LANG` made after that first access are not observed.
static LANG_ENCODING: Lazy<Option<&'static Encoding>> = Lazy::new(get_lang_encoding);
/// Asynchronously reads the file at `path` and decodes its bytes with [`auto_decode`].
///
/// Only compiled when the `tokio` feature is enabled.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or read, or if
/// [`auto_decode`] rejects the contents.
#[cfg(feature = "tokio")]
pub async fn a_auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
    use tokio::io::AsyncReadExt as _;

    let mut bytes = Vec::new();
    tokio::fs::File::open(path)
        .await?
        .read_to_end(&mut bytes)
        .await?;

    let text = auto_decode(&bytes)?;
    Ok(text)
}
/// Reads the file at `path` and decodes its bytes with [`auto_decode`].
///
/// # Errors
///
/// Returns an error if the file cannot be opened or read, or if
/// [`auto_decode`] rejects the contents.
pub fn auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
    let mut bytes = Vec::new();
    File::open(path)?.read_to_end(&mut bytes)?;

    let text = auto_decode(&bytes)?;
    Ok(text)
}
pub fn auto_decode(input: &[u8]) -> crate::Result<String> {
let encoding = detect_encoding(input);
let (cow, encoding_used, had_errors) = encoding.decode(input);
if had_errors {
return Err(crate::Error::Decode(format!(
"Decoding error with {:?}",
encoding_used
)));
}
let decoded = cow.into_owned();
if is_garbled(&decoded) {
return Err(crate::Error::Decode(format!(
"Garbled text detected with {:?}",
encoding_used
)));
}
Ok(decoded.trim().to_owned())
}
/// Guesses the encoding of `input` by inspecting at most its first
/// `MAX_CHECK_LEN` bytes.
///
/// Candidates are tried in this order:
/// 1. a UTF-8 / UTF-16LE / UTF-16BE byte-order mark,
/// 2. valid UTF-8,
/// 3. the encoding cached in `LANG_ENCODING`, if any,
/// 4. GBK.
///
/// Empty input, and input that matches none of the candidates, yields UTF-8.
fn detect_encoding(input: &[u8]) -> &'static Encoding {
    const MAX_CHECK_LEN: usize = 1024;

    // When the sample is a strict prefix of `input`, the cut may land in the
    // middle of a multi-byte character. That must not count as malformed.
    let is_truncated = input.len() > MAX_CHECK_LEN;
    let check_input = &input[..input.len().min(MAX_CHECK_LEN)];
    if check_input.is_empty() {
        return encoding_rs::UTF_8;
    }

    match check_input {
        [0xEF, 0xBB, 0xBF, ..] => return encoding_rs::UTF_8,
        [0xFF, 0xFE, ..] => return encoding_rs::UTF_16LE,
        [0xFE, 0xFF, ..] => return encoding_rs::UTF_16BE,
        _ => {}
    }

    if is_utf8(check_input) || (is_truncated && ends_mid_utf8_sequence(check_input)) {
        return encoding_rs::UTF_8;
    }

    // Only treat the sample as the end of the stream when it really is.
    let is_last = !is_truncated;

    if let Some(encoding) = *LANG_ENCODING {
        if sample_decodes_cleanly(encoding, check_input, is_last) {
            return encoding;
        }
    }

    let fallbacks = [encoding_rs::GBK];
    fallbacks
        .iter()
        .copied()
        .find(|&encoding| sample_decodes_cleanly(encoding, check_input, is_last))
        .unwrap_or(encoding_rs::UTF_8)
}

/// Returns `true` when `sample` is valid UTF-8 except for an incomplete
/// multi-byte sequence at its very end.
fn ends_mid_utf8_sequence(sample: &[u8]) -> bool {
    match std::str::from_utf8(sample) {
        Ok(_) => false,
        // `error_len() == None` means the input ended unexpectedly rather
        // than containing an invalid byte.
        Err(err) => err.error_len().is_none(),
    }
}

/// Returns `true` when `sample` decodes with `encoding` without any malformed
/// sequence. With `is_last == false`, an incomplete trailing sequence is
/// tolerated because more input would follow.
fn sample_decodes_cleanly(encoding: &'static Encoding, sample: &[u8], is_last: bool) -> bool {
    // Any BOM has already been handled by the caller, so no sniffing here.
    let mut decoder = encoding.new_decoder_without_bom_handling();
    let capacity = match decoder.max_utf8_buffer_length_without_replacement(sample.len()) {
        Some(capacity) => capacity,
        None => return false,
    };
    let mut scratch = String::with_capacity(capacity);
    let (result, _bytes_read) =
        decoder.decode_to_string_without_replacement(sample, &mut scratch, is_last);
    matches!(result, encoding_rs::DecoderResult::InputEmpty)
}
/// Returns `true` when `input` is a well-formed UTF-8 byte sequence.
#[inline]
fn is_utf8(input: &[u8]) -> bool {
    matches!(std::str::from_utf8(input), Ok(_))
}
/// Returns `true` when `s` contains an ASCII control character other than
/// tab, line feed or carriage return (that is, 0x00–0x1F except those three,
/// or 0x7F).
///
/// Scanning bytes is sufficient: every byte of a multi-byte UTF-8 sequence is
/// >= 0x80 and so can never be mistaken for an ASCII control character.
/// An empty string is never garbled.
fn is_garbled(s: &str) -> bool {
    // A `char` is always a valid Unicode scalar value, so no per-character
    // validity check is needed on a `&str`.
    s.bytes()
        .any(|b| b.is_ascii_control() && !matches!(b, b'\t' | b'\n' | b'\r'))
}
/// Maps the `LANG` environment variable to the legacy encoding most likely to
/// be used for non-UTF-8 text in that locale.
///
/// `LANG` is read as `language[_territory][.codeset][@modifier]`, compared
/// case-insensitively. Only the language and territory are considered; the
/// codeset and modifier are ignored so that, for example, `ru_RU.KOI8-R` is
/// not mistaken for Korean because its codeset contains "ko".
///
/// Returns `None` when `LANG` is unset, not valid Unicode, or names a locale
/// with no mapping.
fn get_lang_encoding() -> Option<&'static Encoding> {
    let lang = env::var("LANG").ok()?.to_lowercase();

    // Strip ".codeset" and "@modifier".
    let locale = lang
        .split(|c| c == '.' || c == '@')
        .next()
        .unwrap_or("");
    let (language, territory) = locale.split_once('_').unwrap_or((locale, ""));

    match (language, territory) {
        ("zh", "cn" | "sg") => Some(encoding_rs::GBK),
        ("zh", "tw" | "hk") => Some(encoding_rs::BIG5),
        // The long spellings keep names such as `japanese` or
        // `Korean_Korea.949` working.
        ("ja" | "japanese", _) => Some(encoding_rs::SHIFT_JIS),
        ("ko" | "korean", _) => Some(encoding_rs::EUC_KR),
        ("ru" | "russian", _) => Some(encoding_rs::WINDOWS_1251),
        _ => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::{
        env,
        io::Write as _,
        sync::{Mutex, MutexGuard},
    };
    use tempdir::TempDir;

    /// Guards the process-wide `LANG` variable. The test harness runs tests on
    /// several threads, so unsynchronised `set_var` calls would race.
    static ENV_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));

    /// Takes the `LANG` lock, ignoring poisoning left by a failed test.
    fn lock_env() -> MutexGuard<'static, ()> {
        ENV_LOCK
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Ensures the cached `LANG_ENCODING` is GBK.
    ///
    /// `LANG_ENCODING` is evaluated once, so every test that can reach it must
    /// call this first; otherwise the cached value would depend on which test
    /// happened to run first and on what `LANG` was at that moment.
    fn pin_lang_encoding_to_gbk() {
        let _guard = lock_env();
        env::set_var("LANG", "zh_CN.UTF-8");
        let _ = Lazy::force(&LANG_ENCODING);
    }

    #[test]
    fn test_auto_decode_utf8() {
        let input = "Hello, world!".as_bytes();
        assert_eq!(auto_decode(input).unwrap(), "Hello, world!");
    }

    #[test]
    fn test_auto_decode_utf8_with_bom() {
        let input = b"\xEF\xBB\xBFHello, world!";
        assert_eq!(auto_decode(input).unwrap(), "Hello, world!");
    }

    #[test]
    fn test_auto_decode_utf16le() {
        let input = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00";
        assert_eq!(auto_decode(input).unwrap(), "Hello");
    }

    #[test]
    fn test_auto_decode_utf16be() {
        let input = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o";
        assert_eq!(auto_decode(input).unwrap(), "Hello");
    }

    #[test]
    fn test_auto_decode_ascii() {
        let input = b"Hello, ASCII!";
        assert_eq!(auto_decode(input).unwrap(), "Hello, ASCII!");
    }

    #[test]
    fn test_auto_decode_empty_input() {
        let input = b"";
        assert_eq!(auto_decode(input).unwrap(), "");
    }

    #[test]
    fn test_auto_decode_invalid_utf8() {
        // Not valid UTF-8, but it starts with a UTF-16LE BOM and is decoded
        // as UTF-16LE.
        let input = b"\xFF\xFE\xFF\xFE";
        assert!(auto_decode(input).is_ok());
    }

    #[test]
    fn test_auto_decode_gbk() {
        pin_lang_encoding_to_gbk();
        let input = b"\xC4\xE3\xBA\xC3";
        assert_eq!(auto_decode(input).unwrap(), "你好");
    }

    #[test]
    fn test_auto_decode_mixed_encoding() {
        pin_lang_encoding_to_gbk();
        let input = b"Hello, \xC4\xE3\xBA\xC3";
        assert_eq!(auto_decode(input).unwrap(), "Hello, 你好");
    }

    #[test]
    fn test_detect_encoding_long_input() {
        let long_input = "A".repeat(2000).into_bytes();
        assert_eq!(detect_encoding(&long_input), encoding_rs::UTF_8);
    }

    #[test]
    fn test_detect_encoding_with_non_bmp_characters() {
        let input = "😀🌍🚀".as_bytes();
        assert_eq!(detect_encoding(input), encoding_rs::UTF_8);
    }

    #[test]
    fn test_is_utf8() {
        assert!(is_utf8("Hello, UTF-8!".as_bytes()));
        assert!(is_utf8("你好,UTF-8!".as_bytes()));
        assert!(is_utf8("🌍🌎🌏".as_bytes()));
        assert!(!is_utf8(b"\xFF\xFE\x00\x00"));
    }

    #[test]
    fn test_is_utf8_edge_cases() {
        // Smallest and largest four-byte scalar values.
        assert!(is_utf8(&[0xF0, 0x90, 0x80, 0x80]));
        assert!(is_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]));
        // Beyond U+10FFFF.
        assert!(!is_utf8(&[0xF4, 0x90, 0x80, 0x80]));
        // Overlong encodings.
        assert!(!is_utf8(&[0xC0, 0x80]));
        assert!(!is_utf8(&[0xE0, 0x80, 0x80]));
    }

    #[test]
    fn test_get_lang_encoding() {
        let _guard = lock_env();
        env::set_var("LANG", "zh_CN.UTF-8");
        assert_eq!(get_lang_encoding(), Some(encoding_rs::GBK));
        env::set_var("LANG", "ja_JP.UTF-8");
        assert_eq!(get_lang_encoding(), Some(encoding_rs::SHIFT_JIS));
        env::set_var("LANG", "en_US.UTF-8");
        assert_eq!(get_lang_encoding(), None);
    }

    #[test]
    fn test_get_lang_encoding_case_insensitive() {
        let _guard = lock_env();
        env::set_var("LANG", "ZH_CN.UTF-8");
        assert_eq!(get_lang_encoding(), Some(encoding_rs::GBK));
        env::set_var("LANG", "JA_jp.utf-8");
        assert_eq!(get_lang_encoding(), Some(encoding_rs::SHIFT_JIS));
    }

    #[test]
    fn test_get_lang_encoding_invalid_lang() {
        let _guard = lock_env();
        env::set_var("LANG", "invalid.UTF-8");
        assert_eq!(get_lang_encoding(), None);
    }

    #[test]
    fn test_not_is_garbled() {
        assert!(
            !is_garbled("Hello, world!"),
            "ASCII text should not be garbled"
        );
        assert!(
            !is_garbled("你好,世界!"),
            "Chinese text should not be garbled"
        );
        assert!(
            !is_garbled("こんにちは、世界!"),
            "Japanese text should not be garbled"
        );
        assert!(
            !is_garbled("안녕하세요, 세계!"),
            "Korean text should not be garbled"
        );
        assert!(
            !is_garbled("Здравствуй, мир!"),
            "Russian text should not be garbled"
        );
        assert!(
            !is_garbled("Hello, !"),
            "Text without control characters should not be garbled"
        );
    }

    #[test]
    fn test_is_garbled() {
        assert!(
            is_garbled("\x01\x02\x03\x04\x05"),
            "Text with control characters should be garbled"
        );
        assert!(!is_garbled(" "), "Whitespace should not be garbled");
        assert!(!is_garbled(""), "Empty string should not be garbled");
    }

    /// Writes `content` to `filename` inside `dir` and returns the full path.
    fn create_test_file(dir: &TempDir, filename: &str, content: &[u8]) -> std::path::PathBuf {
        let file_path = dir.path().join(filename);
        let mut file = File::create(&file_path).unwrap();
        file.write_all(content).unwrap();
        file_path
    }

    #[test]
    fn test_auto_decode_file() {
        pin_lang_encoding_to_gbk();
        let temp_dir = TempDir::new("test_auto_decode_file").unwrap();

        let utf8_path = create_test_file(&temp_dir, "utf8.txt", "你好,世界!".as_bytes());
        assert_eq!(auto_decode_file(utf8_path).unwrap(), "你好,世界!");

        let gbk_path = create_test_file(
            &temp_dir,
            "gbk.txt",
            b"\xC4\xE3\xBA\xC3\xA3\xAC\xCA\xC0\xBD\xE7\xA3\xA1",
        );
        assert_eq!(auto_decode_file(gbk_path).unwrap(), "你好,世界!");

        let empty_path = create_test_file(&temp_dir, "empty.txt", b"");
        assert_eq!(auto_decode_file(empty_path).unwrap(), "");

        let result = auto_decode_file("non_existent_file.txt");
        assert!(result.is_err());
    }

    #[cfg(feature = "tokio")]
    #[tokio::test]
    async fn test_a_auto_decode_file() {
        // The guard is released inside the helper, so no lock is held across
        // an `.await`.
        pin_lang_encoding_to_gbk();
        let temp_dir = TempDir::new("test_a_auto_decode_file").unwrap();

        let utf8_path = create_test_file(&temp_dir, "utf8_async.txt", "你好,世界!".as_bytes());
        assert_eq!(a_auto_decode_file(utf8_path).await.unwrap(), "你好,世界!");

        let gbk_path = create_test_file(
            &temp_dir,
            "gbk_async.txt",
            b"\xC4\xE3\xBA\xC3\xA3\xAC\xCA\xC0\xBD\xE7\xA3\xA1",
        );
        assert_eq!(a_auto_decode_file(gbk_path).await.unwrap(), "你好,世界!");

        let empty_path = create_test_file(&temp_dir, "empty_async.txt", b"");
        assert_eq!(a_auto_decode_file(empty_path).await.unwrap(), "");

        let result = a_auto_decode_file("non_existent_file_async.txt").await;
        assert!(result.is_err());
    }
}