1use encoding_rs::Encoding;
2use once_cell::sync::Lazy;
3use std::{fs::File, io::Read as _, path::Path};
4
/// Encoding derived from the first item yielded by [`get_lang_encoding`].
///
/// The value is computed lazily on first access. When `get_lang_encoding`
/// yields nothing, UTF-8 is used instead.
static LANG_ENCODING: Lazy<&'static Encoding> = Lazy::new(|| get_lang_encoding().next().unwrap_or(encoding_rs::UTF_8));
7
/// Asynchronously reads the file at `path` and decodes its bytes into a `String`.
///
/// The raw bytes are handed to [`auto_decode`]. When that fails to settle on an
/// encoding, the bytes are converted with lossy UTF-8 decoding instead, so a
/// decoding failure never surfaces as an error.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or read.
#[cfg(feature = "tokio")]
pub async fn a_auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
    let bytes = tokio::fs::read(path).await?;

    let text = match auto_decode(&bytes) {
        Ok(decoded) => decoded,
        // No encoding matched: keep whatever is valid UTF-8 and replace the rest.
        Err(_) => String::from_utf8_lossy(&bytes).into_owned(),
    };
    Ok(text)
}
30
31pub fn auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
44 let mut file = File::open(path)?;
46 let mut buffer = Vec::new();
47 file.read_to_end(&mut buffer)?;
48
49 Ok(auto_decode(&buffer).unwrap_or_else(|_| String::from_utf8_lossy(&buffer).to_string()))
51}
52
53pub fn auto_decode(input: &[u8]) -> crate::Result<String> {
67 let utf8 = encoding_rs::UTF_8.name();
68 if LANG_ENCODING.name() != utf8 {
69 auto_decode_all(input)
70 } else {
71 auto_decode_simple(input)
72 }
73}
74
75pub fn auto_decode_simple(input: &[u8]) -> crate::Result<String> {
89 if input.is_empty() {
91 return Ok(String::new());
92 }
93
94 if let Ok(s) = std::str::from_utf8(input) {
95 if !is_garbled(&s) {
96 return Ok(s.trim().to_owned());
97 }
98 }
99 let (cow, _, had_errors) = LANG_ENCODING.decode(input);
101 if !had_errors && !is_garbled(&cow) {
102 return Ok(cow.trim().to_owned());
103 }
104 let (cow, _, had_errors) = encoding_rs::GBK.decode(input);
105 if !had_errors && !is_garbled(&cow) {
106 return Ok(cow.trim().to_owned());
107 }
108 Err("找不到编码".into())
109}
110pub fn auto_decode_all(input: &[u8]) -> crate::Result<String> {
124 if input.is_empty() {
126 return Ok(String::new());
127 }
128
129 if let Some(encoding) = check_bom(input) {
131 let (decoded, _, had_errors) = encoding.decode(input);
132 if !had_errors && !is_garbled(&decoded) {
133 return Ok(decoded.trim().to_owned());
134 }
135 }
136 if let Ok(s) = std::str::from_utf8(input) {
137 if !is_garbled(&s) {
138 return Ok(s.trim().to_owned());
139 }
140 }
141 let (cow, _, had_errors) = LANG_ENCODING.decode(input);
143 if !had_errors {
144 return Ok(cow.trim().to_owned());
145 }
146
147 let encodings = [
149 encoding_rs::GBK, encoding_rs::BIG5, encoding_rs::SHIFT_JIS, encoding_rs::EUC_KR, encoding_rs::WINDOWS_1251, encoding_rs::WINDOWS_1252, encoding_rs::WINDOWS_1256, encoding_rs::WINDOWS_874, encoding_rs::WINDOWS_1258, encoding_rs::WINDOWS_1250, encoding_rs::WINDOWS_1257, encoding_rs::WINDOWS_1254, encoding_rs::WINDOWS_1253, encoding_rs::WINDOWS_1255, encoding_rs::WINDOWS_1256, ];
165
166 for &encoding in encodings.iter() {
167 let (cow, _, had_errors) = encoding.decode(input);
168 if !had_errors && !is_garbled(&cow) {
169 return Ok(cow.trim().to_owned());
170 }
171 }
172
173 Err("找不到编码".into())
174}
175
176#[inline(always)]
178pub fn check_bom(input: &[u8]) -> Option<&'static Encoding> {
179 if input.len() < 2 {
180 return None;
181 }
182 match (input[0], input[1]) {
183 (0xFF, 0xFE) => Some(encoding_rs::UTF_16LE),
184 (0xFE, 0xFF) => Some(encoding_rs::UTF_16BE),
185 _ => None,
186 }
187}
188
/// Reports whether `input` is a well-formed UTF-8 byte sequence.
///
/// Empty input counts as valid.
#[inline]
pub fn is_utf8(input: &[u8]) -> bool {
    let validation = std::str::from_utf8(input);
    validation.is_ok()
}
/// Heuristically decides whether `s` looks like mis-decoded text.
///
/// A character counts as suspicious when it is
/// - in the private-use range U+E000..=U+F8FF,
/// - a control character other than `\n`, `\r` or `\t`, or
/// - non-ASCII and rejected by [`is_valid_unicode`].
///
/// The text is considered garbled when more than 40% of its characters are
/// suspicious. An empty string is never garbled.
#[inline]
pub fn is_garbled(s: &str) -> bool {
    let mut total = 0usize;
    let mut suspicious = 0usize;

    for ch in s.chars() {
        total += 1;

        let private_use = matches!(ch, '\u{E000}'..='\u{F8FF}');
        let stray_control = ch.is_control() && ch != '\n' && ch != '\r' && ch != '\t';
        let unknown_non_ascii = !ch.is_ascii() && !is_valid_unicode(ch);

        if private_use || stray_control || unknown_non_ascii {
            suspicious += 1;
        }
    }

    if total == 0 {
        return false;
    }
    (suspicious as f32 / total as f32) > 0.4
}

/// Reports whether `c` belongs to the set of characters treated as ordinary text.
///
/// Accepted are alphabetic and numeric characters, ASCII punctuation, and a
/// fixed set of code-point ranges covering CJK ideographs, kana, Hangul
/// syllables, and several symbol and emoji blocks.
#[inline]
fn is_valid_unicode(c: char) -> bool {
    if c.is_alphabetic() || c.is_numeric() || c.is_ascii_punctuation() {
        return true;
    }

    matches!(
        c as u32,
        0x4E00..=0x9FFF         // CJK Unified Ideographs
            | 0x3040..=0x309F   // Hiragana
            | 0x30A0..=0x30FF   // Katakana
            | 0xAC00..=0xD7AF   // Hangul Syllables
            | 0x1F300..=0x1F9FF // pictographs and emoji
            | 0x2600..=0x26FF   // Miscellaneous Symbols
            | 0x2700..=0x27BF   // Dingbats
            | 0x1F000..=0x1F02F // Mahjong Tiles
            | 0x1F0A0..=0x1F0FF // Playing Cards
            | 0x1F100..=0x1F1FF // Enclosed Alphanumeric Supplement
            | 0x1F200..=0x1F2FF // Enclosed Ideographic Supplement
    )
}
/// Returns the locale identifiers yielded by `sys_locale::get_locales()`.
///
/// This is a thin wrapper; the items are passed through unchanged.
#[inline]
pub fn get_lang() -> impl Iterator<Item = String> {
    sys_locale::get_locales()
}
244#[inline]
246pub fn get_lang_encoding() -> impl Iterator<Item = &'static Encoding> {
247 get_lang().filter_map(|locale| {
248 let locale = locale.to_lowercase();
249 Some(match locale {
250 l if l.contains("zh-cn") || l.contains("zh-sg") => encoding_rs::GBK,
252 l if l.contains("zh-tw") || l.contains("zh-hk") => encoding_rs::BIG5,
253 l if l.contains("ja") => encoding_rs::SHIFT_JIS,
254 l if l.contains("ko") => encoding_rs::EUC_KR,
255
256 l if l.contains("ru") || l.contains("uk") || l.contains("be") => encoding_rs::WINDOWS_1251,
258
259 l if l.contains("ar") || l.contains("he") || l.contains("fa") => encoding_rs::WINDOWS_1256,
261
262 l if l.contains("th") => encoding_rs::WINDOWS_874,
264 l if l.contains("vi") => encoding_rs::WINDOWS_1258,
265
266 l if l.contains("cs")
269 || l.contains("hu")
270 || l.contains("pl")
271 || l.contains("ro")
272 || l.contains("hr")
273 || l.contains("sk")
274 || l.contains("sl")
275 || l.contains("sr") =>
276 {
277 encoding_rs::WINDOWS_1250
278 }
279 l if l.contains("de")
281 || l.contains("fr")
282 || l.contains("es")
283 || l.contains("it")
284 || l.contains("pt")
285 || l.contains("nl")
286 || l.contains("sv")
287 || l.contains("da")
288 || l.contains("no")
289 || l.contains("fi") =>
290 {
291 encoding_rs::WINDOWS_1252
292 }
293 l if l.contains("el") => encoding_rs::WINDOWS_1253,
295 l if l.contains("tr") => encoding_rs::WINDOWS_1254,
297 l if l.contains("et") || l.contains("lt") || l.contains("lv") => encoding_rs::WINDOWS_1257,
299 _ => encoding_rs::UTF_8,
301 })
302 })
303}
304
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_auto_decode_utf8() {
        let input = "Hello, world!".as_bytes();
        assert_eq!(auto_decode(input).unwrap(), "Hello, world!");
    }

    #[test]
    fn test_auto_decode_utf16le() {
        let input = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00";
        assert_eq!(auto_decode(input).unwrap(), "Hello");
    }

    #[test]
    fn test_auto_decode_utf16be() {
        let input = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o";
        assert_eq!(auto_decode(input).unwrap(), "Hello");
    }

    #[test]
    fn test_auto_decode_ascii() {
        let input = b"Hello, ASCII!";
        assert_eq!(auto_decode(input).unwrap(), "Hello, ASCII!");
    }

    #[test]
    fn test_auto_decode_empty_input() {
        let input = b"";
        assert_eq!(auto_decode(input).unwrap(), "");
    }

    #[test]
    fn test_is_utf8_edge_cases() {
        // Smallest and largest code points that need four bytes.
        assert!(is_utf8(&[0xF0, 0x90, 0x80, 0x80]));
        assert!(is_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]));
        // Beyond U+10FFFF.
        assert!(!is_utf8(&[0xF4, 0x90, 0x80, 0x80]));
        // Overlong encodings.
        assert!(!is_utf8(&[0xC0, 0x80]));
        assert!(!is_utf8(&[0xE0, 0x80, 0x80]));
    }

    // `a_auto_decode_file` only exists with the `tokio` feature, so this test
    // must be gated the same way or the test module fails to compile without it.
    #[cfg(feature = "tokio")]
    #[tokio::test]
    async fn test_error_code() -> crate::AnyResult<()> {
        let cases = [
            ("error_code12.log", encoding_rs::UTF_8, "Привет"),
            ("error_code14.log", encoding_rs::GBK, "你好臺灣"),
            ("error_code11.log", encoding_rs::UTF_8, "こんにちは"),
            ("error_code13.log", encoding_rs::UTF_8, "안녕하세요"),
        ];

        for (file_name, encoding, text) in cases {
            // The OS temp directory always exists, unlike a `target/` directory
            // relative to the current working directory.
            let path = std::env::temp_dir().join(file_name);
            tokio::fs::write(&path, encoding.encode(text).0).await?;
            assert_eq!(text.to_string(), a_auto_decode_file(&path).await.unwrap());
        }
        Ok(())
    }

    #[test]
    fn test_is_garbled() {
        // Ordinary text in various scripts.
        assert!(!is_garbled("Hello, 世界!"));
        assert!(!is_garbled(""));
        assert!(!is_garbled("こんにちは"));
        assert!(is_garbled("��������"));
        assert!(!is_garbled("안녕하세요"));
        assert!(!is_garbled("Привет мир"));
        assert!(!is_garbled("مرحبا بالعالم"));
        assert!(!is_garbled("ยินดีต้อนรับ"));
        assert!(!is_garbled("Hello世界こんにちは안녕123!@#"));

        // Emoji, alone and mixed with text.
        assert!(!is_garbled("📱🌍🎉🎨"));
        assert!(!is_garbled("表情😊混合🌟测试"));

        // Whitespace and control characters.
        assert!(!is_garbled("\n\r\t"));
        assert!(!is_garbled("Hello\nWorld\r\n"));
        assert!(is_garbled("\0\0\0\0\0"));
        assert!(!is_garbled("Hello\0World"));

        // Private-use code points.
        assert!(is_garbled("\u{E000}\u{E001}\u{E002}\u{E003}\u{E004}"));

        // ASCII-only edge cases.
        assert!(!is_garbled(" "));
        assert!(!is_garbled("!@#$%^&*()"));
        assert!(!is_garbled("1234567890"));

        // Latin-1 letters versus replacement characters.
        assert!(!is_garbled("þÿ"));
        assert!(is_garbled("���������"));
        assert!(is_garbled("\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"));
    }
}