e_utils/system/
encode.rs

1use encoding_rs::Encoding;
2use once_cell::sync::Lazy;
3use std::{fs::File, io::Read as _, path::Path};
4
5// 修改 LANG_ENCODING 的定义
6static LANG_ENCODING: Lazy<&'static Encoding> = Lazy::new(|| get_lang_encoding().next().unwrap_or(encoding_rs::UTF_8));
7
/// Reads the whole file at `path` asynchronously and returns its decoded text.
///
/// The raw bytes are passed to [`auto_decode`]. If no encoding can be
/// determined, the bytes are decoded as UTF-8 with every invalid sequence
/// replaced by U+FFFD instead of returning an error.
///
/// # Errors
///
/// Returns an error if the file cannot be read.
///
/// # Example
///
/// ```no_run
/// # use e_utils::system::encode::a_auto_decode_file;
/// # async fn run() -> Result<(), Box<dyn std::error::Error>> {
/// let content = a_auto_decode_file("path/to/file.txt").await?;
/// println!("File content: {}", content);
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "tokio")]
pub async fn a_auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
  let bytes = tokio::fs::read(path).await?;

  // Detection failure is not fatal: degrade to a lossy UTF-8 view of the bytes.
  let text = match auto_decode(&bytes) {
    Ok(decoded) => decoded,
    Err(_) => String::from_utf8_lossy(&bytes).into_owned(),
  };
  Ok(text)
}
30
31/// Synchronously reads a file and automatically decodes its content.
32///
33/// # Example
34///
35/// ```no_run
36/// # use e_utils::system::encode::auto_decode_file;
37/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
38/// let content = auto_decode_file("path/to/file.txt")?;
39/// println!("File content: {}", content);
40/// # Ok(())
41/// # }
42/// ```
43pub fn auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
44  // 读取文件内容
45  let mut file = File::open(path)?;
46  let mut buffer = Vec::new();
47  file.read_to_end(&mut buffer)?;
48
49  // 使用 auto_decode 解码文件内容
50  Ok(auto_decode(&buffer).unwrap_or_else(|_| String::from_utf8_lossy(&buffer).to_string()))
51}
52
53/// Automatically detects encoding and decodes the input byte sequence.
54///
55/// # Example
56///
57/// ```
58/// # use e_utils::system::encode::auto_decode;
59/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
60/// let bytes = vec![0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD]; // "你好" in UTF-8
61/// let decoded = auto_decode(&bytes)?;
62/// assert_eq!(decoded, "你好");
63/// # Ok(())
64/// # }
65/// ```
66pub fn auto_decode(input: &[u8]) -> crate::Result<String> {
67  let utf8 = encoding_rs::UTF_8.name();
68  if LANG_ENCODING.name() != utf8 {
69    auto_decode_all(input)
70  } else {
71    auto_decode_simple(input)
72  }
73}
74
75/// Automatically detects encoding and decodes the input byte sequence.
76///
77/// # Example
78///
79/// ```
80/// # use e_utils::system::encode::auto_decode_simple;
81/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
82/// let bytes = vec![0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD]; // "你好" in UTF-8
83/// let decoded = auto_decode_simple(&bytes)?;
84/// assert_eq!(decoded, "你好");
85/// # Ok(())
86/// # }
87/// ```
88pub fn auto_decode_simple(input: &[u8]) -> crate::Result<String> {
89  // 快速路径:空输入和 UTF-8
90  if input.is_empty() {
91    return Ok(String::new());
92  }
93
94  if let Ok(s) = std::str::from_utf8(input) {
95    if !is_garbled(&s) {
96      return Ok(s.trim().to_owned());
97    }
98  }
99  // 优先尝试系统语言编码
100  let (cow, _, had_errors) = LANG_ENCODING.decode(input);
101  if !had_errors && !is_garbled(&cow) {
102    return Ok(cow.trim().to_owned());
103  }
104  let (cow, _, had_errors) = encoding_rs::GBK.decode(input);
105  if !had_errors && !is_garbled(&cow) {
106    return Ok(cow.trim().to_owned());
107  }
108  Err("找不到编码".into())
109}
110/// Automatically detects encoding and decodes the input byte sequence.
111///
112/// # Example
113///
114/// ```
115/// # use e_utils::system::encode::auto_decode_all;
116/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
117/// let bytes = vec![0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD]; // "你好" in UTF-8
118/// let decoded = auto_decode_all(&bytes)?;
119/// assert_eq!(decoded, "你好");
120/// # Ok(())
121/// # }
122/// ```
123pub fn auto_decode_all(input: &[u8]) -> crate::Result<String> {
124  // 快速路径:空输入和 UTF-8
125  if input.is_empty() {
126    return Ok(String::new());
127  }
128
129  // BOM 检查
130  if let Some(encoding) = check_bom(input) {
131    let (decoded, _, had_errors) = encoding.decode(input);
132    if !had_errors && !is_garbled(&decoded) {
133      return Ok(decoded.trim().to_owned());
134    }
135  }
136  if let Ok(s) = std::str::from_utf8(input) {
137    if !is_garbled(&s) {
138      return Ok(s.trim().to_owned());
139    }
140  }
141  // 尝试系统语言编码
142  let (cow, _, had_errors) = LANG_ENCODING.decode(input);
143  if !had_errors {
144    return Ok(cow.trim().to_owned());
145  }
146
147  // 按优先级尝试其他编码
148  let encodings = [
149    encoding_rs::GBK,          // 中文简体
150    encoding_rs::BIG5,         // 繁体中文
151    encoding_rs::SHIFT_JIS,    // 日语
152    encoding_rs::EUC_KR,       // 韩语
153    encoding_rs::WINDOWS_1251, // 俄语
154    encoding_rs::WINDOWS_1252, // 西欧
155    encoding_rs::WINDOWS_1256, // 阿拉伯语
156    encoding_rs::WINDOWS_874,  // 泰语
157    encoding_rs::WINDOWS_1258, // 越南语
158    encoding_rs::WINDOWS_1250, // 东欧
159    encoding_rs::WINDOWS_1257, // 波罗的海
160    encoding_rs::WINDOWS_1254, // 土耳其
161    encoding_rs::WINDOWS_1253, // 希腊
162    encoding_rs::WINDOWS_1255, // 希伯来语
163    encoding_rs::WINDOWS_1256, // 阿拉伯语
164  ];
165
166  for &encoding in encodings.iter() {
167    let (cow, _, had_errors) = encoding.decode(input);
168    if !had_errors && !is_garbled(&cow) {
169      return Ok(cow.trim().to_owned());
170    }
171  }
172
173  Err("找不到编码".into())
174}
175
176/// 内联 BOM 检查,避免额外的函数调用
177#[inline(always)]
178pub fn check_bom(input: &[u8]) -> Option<&'static Encoding> {
179  if input.len() < 2 {
180    return None;
181  }
182  match (input[0], input[1]) {
183    (0xFF, 0xFE) => Some(encoding_rs::UTF_16LE),
184    (0xFE, 0xFF) => Some(encoding_rs::UTF_16BE),
185    _ => None,
186  }
187}
188
/// Returns `true` when `input` is a well-formed UTF-8 byte sequence.
///
/// Empty input counts as valid.
#[inline]
pub fn is_utf8(input: &[u8]) -> bool {
  let validated = std::str::from_utf8(input);
  validated.is_ok()
}
194/// 检查是否为乱码
195#[inline]
196pub fn is_garbled(s: &str) -> bool {
197    if s.is_empty() {
198        return false;
199    }
200
201    let total_chars = s.chars().count();
202    let special_chars = s
203        .chars()
204        .filter(|&c| {
205            // 检查私有使用区 (U+E000 到 U+F8FF)
206            (c as u32 >= 0xE000 && c as u32 <= 0xF8FF) ||
207            // 合并空字符和控制字符的检查
208            (c.is_control() && !matches!(c, '\n' | '\r' | '\t')) ||
209            // 优化非 ASCII 字符的检查
210            (!c.is_ascii() && !is_valid_unicode(c))
211        })
212        .count();
213
214    (special_chars as f32 / total_chars as f32) > 0.4
215}
216
217
/// Returns `true` when `c` is a character expected in ordinary human-readable text.
///
/// Accepted are alphabetic, numeric and whitespace characters, ASCII
/// punctuation, and a set of Unicode blocks covering CJK text, common
/// non-ASCII punctuation and emoji. Everything else (for example U+FFFD or
/// private use characters) is rejected.
#[inline]
fn is_valid_unicode(c: char) -> bool {
  // Broad character classes first. Whitespace is included so that characters
  // such as the ideographic space U+3000 or NBSP do not count as garbage.
  if c.is_alphabetic() || c.is_numeric() || c.is_whitespace() || c.is_ascii_punctuation() {
    return true;
  }

  matches!(
    u32::from(c),
    0x2000..=0x206F        // General Punctuation (dashes, quotes, ellipsis, ZWJ)
      | 0x2600..=0x26FF    // Miscellaneous Symbols
      | 0x2700..=0x27BF    // Dingbats
      | 0x3000..=0x303F    // CJK Symbols and Punctuation
      | 0x3040..=0x309F    // Hiragana
      | 0x30A0..=0x30FF    // Katakana
      | 0x4E00..=0x9FFF    // CJK Unified Ideographs
      | 0xAC00..=0xD7AF    // Hangul Syllables
      | 0xFE00..=0xFE0F    // Variation Selectors (emoji presentation)
      | 0xFF00..=0xFFEF    // Halfwidth and Fullwidth Forms
      | 0x1F000..=0x1F02F  // Mahjong Tiles
      | 0x1F0A0..=0x1F0FF  // Playing Cards
      | 0x1F100..=0x1F1FF  // Enclosed Alphanumeric Supplement
      | 0x1F200..=0x1F2FF  // Enclosed Ideographic Supplement
      | 0x1F300..=0x1F9FF  // Emoji and pictographs
      | 0x1FA00..=0x1FAFF  // Chess symbols, Symbols and Pictographs Extended-A
  )
}
239/// 获取当前语言
240#[inline]
241pub fn get_lang() -> impl Iterator<Item = String> {
242  sys_locale::get_locales()
243}
244/// 获取语言编码
245#[inline]
246pub fn get_lang_encoding() -> impl Iterator<Item = &'static Encoding> {
247  get_lang().filter_map(|locale| {
248    let locale = locale.to_lowercase();
249    Some(match locale {
250      // 东亚编码
251      l if l.contains("zh-cn") || l.contains("zh-sg") => encoding_rs::GBK,
252      l if l.contains("zh-tw") || l.contains("zh-hk") => encoding_rs::BIG5,
253      l if l.contains("ja") => encoding_rs::SHIFT_JIS,
254      l if l.contains("ko") => encoding_rs::EUC_KR,
255
256      // 西里尔文编码
257      l if l.contains("ru") || l.contains("uk") || l.contains("be") => encoding_rs::WINDOWS_1251,
258
259      // 中东编码
260      l if l.contains("ar") || l.contains("he") || l.contains("fa") => encoding_rs::WINDOWS_1256,
261
262      // 南亚和东南亚编码
263      l if l.contains("th") => encoding_rs::WINDOWS_874,
264      l if l.contains("vi") => encoding_rs::WINDOWS_1258,
265
266      // 欧洲编码
267      // 东欧
268      l if l.contains("cs")
269        || l.contains("hu")
270        || l.contains("pl")
271        || l.contains("ro")
272        || l.contains("hr")
273        || l.contains("sk")
274        || l.contains("sl")
275        || l.contains("sr") =>
276      {
277        encoding_rs::WINDOWS_1250
278      }
279      // 西欧
280      l if l.contains("de")
281        || l.contains("fr")
282        || l.contains("es")
283        || l.contains("it")
284        || l.contains("pt")
285        || l.contains("nl")
286        || l.contains("sv")
287        || l.contains("da")
288        || l.contains("no")
289        || l.contains("fi") =>
290      {
291        encoding_rs::WINDOWS_1252
292      }
293      // 希腊
294      l if l.contains("el") => encoding_rs::WINDOWS_1253,
295      // 土耳其
296      l if l.contains("tr") => encoding_rs::WINDOWS_1254,
297      // 波罗的海
298      l if l.contains("et") || l.contains("lt") || l.contains("lv") => encoding_rs::WINDOWS_1257,
299      // 默认使用 UTF-8
300      _ => encoding_rs::UTF_8,
301    })
302  })
303}
304
#[cfg(test)]
mod tests {
  use super::*;

  #[test]
  fn test_auto_decode_utf8() {
    let input = "Hello, world!".as_bytes();
    assert_eq!(auto_decode(input).unwrap(), "Hello, world!");
  }

  #[test]
  fn test_auto_decode_utf16le() {
    let input = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00";
    assert_eq!(auto_decode(input).unwrap(), "Hello");
  }

  #[test]
  fn test_auto_decode_utf16be() {
    let input = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o";
    assert_eq!(auto_decode(input).unwrap(), "Hello");
  }

  #[test]
  fn test_auto_decode_ascii() {
    let input = b"Hello, ASCII!";
    assert_eq!(auto_decode(input).unwrap(), "Hello, ASCII!");
  }

  #[test]
  fn test_auto_decode_empty_input() {
    let input = b"";
    assert_eq!(auto_decode(input).unwrap(), "");
  }

  #[test]
  fn test_is_utf8_edge_cases() {
    assert!(is_utf8(&[0xF0, 0x90, 0x80, 0x80])); // smallest four-byte UTF-8 sequence
    assert!(is_utf8(&[0xF4, 0x8F, 0xBF, 0xBF])); // largest four-byte UTF-8 sequence
    assert!(!is_utf8(&[0xF4, 0x90, 0x80, 0x80])); // four-byte sequence beyond the Unicode range
    assert!(!is_utf8(&[0xC0, 0x80])); // overlong encoding
    assert!(!is_utf8(&[0xE0, 0x80, 0x80])); // overlong encoding
  }

  /// Writes `bytes` to a scratch file in the system temp directory, decodes it
  /// with `a_auto_decode_file`, removes the file and returns the decoded text.
  #[cfg(feature = "tokio")]
  async fn decode_via_temp_file(tag: &str, bytes: &[u8]) -> crate::AnyResult<String> {
    // The process id keeps concurrent test runs from clobbering each other.
    let file_name = format!("e_utils_encode_{}_{}.log", std::process::id(), tag);
    let path = std::env::temp_dir().join(file_name);
    tokio::fs::write(&path, bytes).await?;
    let decoded = a_auto_decode_file(&path).await;
    // Best-effort cleanup; a leftover scratch file must not fail the test.
    let _ = tokio::fs::remove_file(&path).await;
    decoded
  }

  // `a_auto_decode_file` only exists with the `tokio` feature, so the test is
  // gated the same way.
  #[cfg(feature = "tokio")]
  #[tokio::test]
  async fn test_error_code() -> crate::AnyResult<()> {
    // Russian text stored as UTF-8
    let (bytes, _, _) = encoding_rs::UTF_8.encode("Привет");
    assert_eq!(decode_via_temp_file("utf8_ru", &bytes).await?, "Привет");
    // Traditional Chinese text stored as GBK
    let (bytes, _, _) = encoding_rs::GBK.encode("你好臺灣");
    assert_eq!(decode_via_temp_file("gbk_zh", &bytes).await?, "你好臺灣");
    // Japanese text stored as UTF-8
    let (bytes, _, _) = encoding_rs::UTF_8.encode("こんにちは");
    assert_eq!(decode_via_temp_file("utf8_ja", &bytes).await?, "こんにちは");
    // Korean text stored as UTF-8
    let (bytes, _, _) = encoding_rs::UTF_8.encode("안녕하세요");
    assert_eq!(decode_via_temp_file("utf8_ko", &bytes).await?, "안녕하세요");
    Ok(())
  }

  #[test]
  fn test_is_garbled() {
    // Basics
    assert!(!is_garbled("Hello, 世界!")); // ordinary mixed Chinese/English
    assert!(!is_garbled("")); // empty string
    assert!(!is_garbled("こんにちは")); // ordinary Japanese
    assert!(is_garbled(&"\u{FFFD}".repeat(8))); // typical mojibake

    // Multiple languages
    assert!(!is_garbled("안녕하세요")); // Korean
    assert!(!is_garbled("Привет мир")); // Russian
    assert!(!is_garbled("مرحبا بالعالم")); // Arabic
    assert!(!is_garbled("ยินดีต้อนรับ")); // Thai

    // Mixed content
    assert!(!is_garbled("Hello世界こんにちは안녕123!@#")); // several languages at once
    assert!(!is_garbled("📱🌍🎉🎨")); // emoji
    assert!(!is_garbled("表情😊混合🌟测试")); // text mixed with emoji

    // Special characters
    assert!(!is_garbled("\n\r\t")); // common control characters
    assert!(!is_garbled("Hello\nWorld\r\n")); // ordinary text with line breaks
    assert!(is_garbled("\0\0\0\0\0")); // run of NUL characters
    assert!(!is_garbled("Hello\0World")); // a single stray NUL

    // Boundary cases
    assert!(is_garbled("\u{E000}\u{E001}\u{E002}\u{E003}\u{E004}")); // private use characters only
    assert!(!is_garbled("     ")); // spaces only
    assert!(!is_garbled("!@#$%^&*()")); // ASCII symbols only
    assert!(!is_garbled("1234567890")); // digits only
    assert!(!is_garbled("þÿ")); // Latin-1 letters

    // Mojibake patterns
    assert!(is_garbled(&"\u{FFFD}".repeat(9))); // replacement characters
    assert!(is_garbled("\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}")); // escaped replacement characters
  }
}