use encoding_rs::Encoding;
use once_cell::sync::Lazy;
use std::{fs::File, io::Read as _, path::Path};
/// Encoding derived from the first locale yielded by [`get_lang_encoding`],
/// computed lazily on first access.
///
/// Falls back to UTF-8 when the locale iterator yields nothing.
static LANG_ENCODING: Lazy<&'static Encoding> = Lazy::new(|| {
    let mut candidates = get_lang_encoding();
    candidates.next().unwrap_or(encoding_rs::UTF_8)
});
/// Asynchronously reads the whole file at `path` and decodes it into a `String`.
///
/// The raw bytes are handed to [`auto_decode`]. When that fails, the bytes are
/// converted with `String::from_utf8_lossy` instead, so a decoding failure is
/// never reported to the caller.
///
/// # Errors
///
/// Returns an error only when the file cannot be opened or read.
#[cfg(feature = "tokio")]
pub async fn a_auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
    use tokio::io::AsyncReadExt as _;

    let mut handle = tokio::fs::File::open(path).await?;
    let mut bytes = Vec::new();
    handle.read_to_end(&mut bytes).await?;

    let text = match auto_decode(&bytes) {
        Ok(decoded) => decoded,
        // Last resort: keep whatever is valid UTF-8 and substitute the rest.
        Err(_) => String::from_utf8_lossy(&bytes).to_string(),
    };
    Ok(text)
}
/// Reads the whole file at `path` and decodes it into a `String`.
///
/// The raw bytes are handed to [`auto_decode`]. When that fails, the bytes are
/// converted with `String::from_utf8_lossy` instead, so a decoding failure is
/// never reported to the caller.
///
/// # Errors
///
/// Returns an error only when the file cannot be opened or read.
pub fn auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
    let mut handle = File::open(path)?;
    let mut bytes = Vec::new();
    handle.read_to_end(&mut bytes)?;

    match auto_decode(&bytes) {
        Ok(decoded) => Ok(decoded),
        // Last resort: keep whatever is valid UTF-8 and substitute the rest.
        Err(_) => Ok(String::from_utf8_lossy(&bytes).to_string()),
    }
}
/// Decodes `input` to a `String`, choosing the detection strategy from the
/// locale-derived `LANG_ENCODING`.
///
/// When the locale encoding is UTF-8 the short probe in [`auto_decode_simple`]
/// is used; for any other locale encoding the wider search in
/// [`auto_decode_all`] runs.
///
/// # Errors
///
/// Propagates the error of whichever strategy was selected.
pub fn auto_decode(input: &[u8]) -> crate::Result<String> {
    let locale_is_utf8 = LANG_ENCODING.name() == encoding_rs::UTF_8.name();
    if locale_is_utf8 {
        auto_decode_simple(input)
    } else {
        auto_decode_all(input)
    }
}
/// Decodes `input` by probing a short list of encodings.
///
/// The probes run in this order and the first acceptable result wins:
///
/// 1. strict UTF-8 (returned without trimming, minus a leading byte order mark),
/// 2. the locale-derived `LANG_ENCODING` (result is trimmed),
/// 3. GBK (result is trimmed).
///
/// A probe is acceptable when it decodes without errors and [`is_garbled`]
/// does not reject the text. Empty input yields an empty string.
///
/// # Errors
///
/// Returns an error when none of the probes produces acceptable text.
pub fn auto_decode_simple(input: &[u8]) -> crate::Result<String> {
    if input.is_empty() {
        return Ok(String::new());
    }

    if let Ok(text) = std::str::from_utf8(input) {
        // `from_utf8` treats a UTF-8 byte order mark as an ordinary U+FEFF
        // character. Drop it so callers do not receive an invisible leading
        // character, and so it does not skew the garble ratio of short inputs.
        let text = text.strip_prefix('\u{FEFF}').unwrap_or(text);
        if !is_garbled(text) {
            return Ok(text.to_owned());
        }
    }

    for &encoding in [*LANG_ENCODING, encoding_rs::GBK].iter() {
        let (decoded, _, had_errors) = encoding.decode(input);
        if !had_errors && !is_garbled(&decoded) {
            return Ok(decoded.trim().to_owned());
        }
    }

    // The message reads "encoding not found".
    Err("找不到编码".into())
}
pub fn auto_decode_all(input: &[u8]) -> crate::Result<String> {
if input.is_empty() {
return Ok(String::new());
}
if let Some(encoding) = check_bom(input) {
let (decoded, _, had_errors) = encoding.decode(input);
if !had_errors && !is_garbled(&decoded) {
return Ok(decoded.trim().to_owned());
}
}
if let Ok(s) = std::str::from_utf8(input) {
if !is_garbled(&s) {
return Ok(s.to_owned());
}
}
let (cow, _, had_errors) = LANG_ENCODING.decode(input);
if !had_errors {
return Ok(cow.trim().to_owned());
}
let encodings = [
encoding_rs::GBK, encoding_rs::BIG5, encoding_rs::SHIFT_JIS, encoding_rs::EUC_KR, encoding_rs::WINDOWS_1251, encoding_rs::WINDOWS_1252, encoding_rs::WINDOWS_1256, encoding_rs::WINDOWS_874, encoding_rs::WINDOWS_1258, encoding_rs::WINDOWS_1250, encoding_rs::WINDOWS_1257, encoding_rs::WINDOWS_1254, encoding_rs::WINDOWS_1253, encoding_rs::WINDOWS_1255, encoding_rs::WINDOWS_1256, ];
for &encoding in encodings.iter() {
let (cow, _, had_errors) = encoding.decode(input);
if !had_errors && !is_garbled(&cow) {
return Ok(cow.trim().to_owned());
}
}
Err("找不到编码".into())
}
/// Looks at the first two bytes of `input` for a UTF-16 byte order mark.
///
/// Returns `UTF_16LE` for `FF FE`, `UTF_16BE` for `FE FF`, and `None` for
/// anything else, including inputs shorter than two bytes. No other byte
/// order marks are recognised.
#[inline(always)]
pub fn check_bom(input: &[u8]) -> Option<&'static Encoding> {
    match input {
        [0xFF, 0xFE, ..] => Some(encoding_rs::UTF_16LE),
        [0xFE, 0xFF, ..] => Some(encoding_rs::UTF_16BE),
        _ => None,
    }
}
/// Reports whether `input` is well-formed UTF-8 according to
/// `std::str::from_utf8`.
///
/// An empty slice counts as valid.
#[inline]
pub fn is_utf8(input: &[u8]) -> bool {
    let validation = std::str::from_utf8(input);
    validation.is_ok()
}
/// Heuristically decides whether `s` looks like mis-decoded text.
///
/// A character counts as suspicious when it is
///
/// * in the range U+E000..=U+F8FF,
/// * a control character other than `\n`, `\r` or `\t`, or
/// * non-ASCII and rejected by `is_valid_unicode`.
///
/// The string is reported as garbled when more than 40% of its characters are
/// suspicious. An empty string is never garbled.
#[inline]
pub fn is_garbled(s: &str) -> bool {
    let mut total = 0usize;
    let mut suspicious = 0usize;

    for ch in s.chars() {
        total += 1;

        let private_use = ('\u{E000}'..='\u{F8FF}').contains(&ch);
        let stray_control = ch.is_control() && !matches!(ch, '\n' | '\r' | '\t');
        let unknown_non_ascii = !ch.is_ascii() && !is_valid_unicode(ch);

        if private_use || stray_control || unknown_non_ascii {
            suspicious += 1;
        }
    }

    if total == 0 {
        return false;
    }
    (suspicious as f32 / total as f32) > 0.4
}
/// Reports whether `c` is a character expected in ordinary text.
///
/// Accepts anything alphabetic, numeric or ASCII punctuation, plus a fixed set
/// of code point ranges covering CJK scripts, symbols and emoji.
#[inline]
fn is_valid_unicode(c: char) -> bool {
    if c.is_alphabetic() || c.is_numeric() || c.is_ascii_punctuation() {
        return true;
    }

    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'        // CJK Unified Ideographs
            | '\u{3040}'..='\u{309F}'  // Hiragana
            | '\u{30A0}'..='\u{30FF}'  // Katakana
            | '\u{AC00}'..='\u{D7AF}'  // Hangul Syllables
            | '\u{1F300}'..='\u{1F9FF}' // emoji and pictograph blocks
            | '\u{2600}'..='\u{26FF}'  // Miscellaneous Symbols
            | '\u{2700}'..='\u{27BF}'  // Dingbats
            | '\u{1F000}'..='\u{1F02F}' // Mahjong Tiles
            | '\u{1F0A0}'..='\u{1F0FF}' // Playing Cards
            | '\u{1F100}'..='\u{1F1FF}' // Enclosed Alphanumeric Supplement
            | '\u{1F200}'..='\u{1F2FF}' // Enclosed Ideographic Supplement
    )
}
/// Returns the locale identifiers yielded by `sys_locale::get_locales()`,
/// unchanged.
///
/// NOTE(review): presumably BCP 47 tags in order of user preference — confirm
/// against the `sys_locale` documentation.
#[inline]
pub fn get_lang() -> impl Iterator<Item = String> {
sys_locale::get_locales()
}
/// Maps every locale yielded by [`get_lang`] to the legacy encoding
/// conventionally associated with its language.
///
/// The decision is made on the primary language subtag only, i.e. the text
/// before the first `-`, `_`, `.` or `@`, compared case-insensitively and
/// exactly. Matching on substrings of the whole tag would let a region leak
/// into the language decision (`fr-BE` contains `be`, `es-AR` contains `ar`).
///
/// Chinese is the one language whose encoding depends on further subtags: the
/// first script or region subtag recognised as Simplified (`Hans`, `CN`, `SG`)
/// selects GBK, and as Traditional (`Hant`, `TW`, `HK`) selects Big5.
///
/// Locales with no table entry map to UTF-8, so exactly one encoding is
/// yielded per locale.
#[inline]
pub fn get_lang_encoding() -> impl Iterator<Item = &'static Encoding> {
    get_lang().map(|locale| {
        let locale = locale.to_lowercase();
        let mut subtags = locale.split(|c: char| matches!(c, '-' | '_' | '.' | '@'));
        let language = subtags.next().unwrap_or("");

        match language {
            "zh" => subtags
                .find_map(|tag| match tag {
                    "hans" | "cn" | "sg" => Some(encoding_rs::GBK),
                    "hant" | "tw" | "hk" => Some(encoding_rs::BIG5),
                    _ => None,
                })
                .unwrap_or(encoding_rs::UTF_8),
            "ja" => encoding_rs::SHIFT_JIS,
            "ko" => encoding_rs::EUC_KR,
            "ru" | "uk" | "be" => encoding_rs::WINDOWS_1251,
            // windows-1255 is the Hebrew code page; windows-1256 is Arabic script.
            "he" => encoding_rs::WINDOWS_1255,
            "ar" | "fa" => encoding_rs::WINDOWS_1256,
            "th" => encoding_rs::WINDOWS_874,
            "vi" => encoding_rs::WINDOWS_1258,
            "cs" | "hu" | "pl" | "ro" | "hr" | "sk" | "sl" | "sr" => encoding_rs::WINDOWS_1250,
            // `nb`/`nn` are the two written standards of Norwegian (`no`);
            // `ca`/`eu`/`gl` are further Western European languages that
            // commonly appear with an `-ES` region.
            "de" | "fr" | "es" | "it" | "pt" | "nl" | "sv" | "da" | "no" | "nb" | "nn" | "fi"
            | "ca" | "eu" | "gl" => encoding_rs::WINDOWS_1252,
            "el" => encoding_rs::WINDOWS_1253,
            "tr" => encoding_rs::WINDOWS_1254,
            "et" | "lt" | "lv" => encoding_rs::WINDOWS_1257,
            _ => encoding_rs::UTF_8,
        }
    })
}
/// Unit tests for the decoding helpers and the garble heuristic.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_auto_decode_utf8() {
        let input = "Hello, world!".as_bytes();
        assert_eq!(auto_decode(input).unwrap(), "Hello, world!");
    }

    #[test]
    fn test_auto_decode_utf16le() {
        let input = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00";
        assert_eq!(auto_decode(input).unwrap(), "Hello");
    }

    #[test]
    fn test_auto_decode_utf16be() {
        let input = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o";
        assert_eq!(auto_decode(input).unwrap(), "Hello");
    }

    #[test]
    fn test_auto_decode_ascii() {
        let input = b"Hello, ASCII!";
        assert_eq!(auto_decode(input).unwrap(), "Hello, ASCII!");
    }

    #[test]
    fn test_auto_decode_empty_input() {
        let input = b"";
        assert_eq!(auto_decode(input).unwrap(), "");
    }

    #[test]
    fn test_is_utf8_edge_cases() {
        // Lowest and highest four-byte sequences are valid.
        assert!(is_utf8(&[0xF0, 0x90, 0x80, 0x80]));
        assert!(is_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]));
        // Beyond U+10FFFF.
        assert!(!is_utf8(&[0xF4, 0x90, 0x80, 0x80]));
        // Overlong encodings.
        assert!(!is_utf8(&[0xC0, 0x80]));
        assert!(!is_utf8(&[0xE0, 0x80, 0x80]));
    }

    // `a_auto_decode_file` only exists when the `tokio` feature is enabled, so
    // this test must carry the same gate or the test build fails without it.
    #[cfg(feature = "tokio")]
    #[tokio::test]
    async fn test_error_code() -> crate::AnyResult<()> {
        tokio::fs::write("target/error_code12.log", encoding_rs::UTF_8.encode("Привет").0).await?;
        assert_eq!("Привет".to_string(), a_auto_decode_file("target/error_code12.log").await.unwrap());

        tokio::fs::write("target/error_code14.log", encoding_rs::GBK.encode("你好臺灣").0).await?;
        assert_eq!("你好臺灣".to_string(), a_auto_decode_file("target/error_code14.log").await.unwrap());

        tokio::fs::write("target/error_code11.log", encoding_rs::UTF_8.encode("こんにちは").0).await?;
        assert_eq!("こんにちは".to_string(), a_auto_decode_file("target/error_code11.log").await.unwrap());

        tokio::fs::write("target/error_code13.log", encoding_rs::UTF_8.encode("안녕하세요").0).await?;
        assert_eq!("안녕하세요".to_string(), a_auto_decode_file("target/error_code13.log").await.unwrap());

        Ok(())
    }

    #[test]
    fn test_is_garbled() {
        assert!(!is_garbled("Hello, 世界!"));
        assert!(!is_garbled(""));
        assert!(!is_garbled("こんにちは"));
        assert!(is_garbled("��������"));
        assert!(!is_garbled("안녕하세요"));
        assert!(!is_garbled("Привет мир"));
        assert!(!is_garbled("مرحبا بالعالم"));
        assert!(!is_garbled("ยินดีต้อนรับ"));
        assert!(!is_garbled("Hello世界こんにちは안녕123!@#"));
        assert!(!is_garbled("📱🌍🎉🎨"));
        assert!(!is_garbled("表情😊混合🌟测试"));
        assert!(!is_garbled("\n\r\t"));
        assert!(!is_garbled("Hello\nWorld\r\n"));
        assert!(is_garbled("\0\0\0\0\0"));
        assert!(!is_garbled("Hello\0World"));
        assert!(is_garbled("\u{E000}\u{E001}\u{E002}\u{E003}\u{E004}"));
        assert!(!is_garbled(" "));
        assert!(!is_garbled("!@#$%^&*()"));
        assert!(!is_garbled("1234567890"));
        assert!(!is_garbled("þÿ"));
        assert!(is_garbled("���������"));
        assert!(is_garbled("\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"));
    }
}