// e-utils 0.4.19 - Docs.rs

use encoding_rs::Encoding;
use once_cell::sync::Lazy;
use std::{fs::File, io::Read as _, path::Path};

/// Encoding associated with the first locale yielded by `get_lang_encoding`, resolved
/// once on first access; UTF-8 is used when no locale is reported at all.
static LANG_ENCODING: Lazy<&'static Encoding> = Lazy::new(|| {
  let mut candidates = get_lang_encoding();
  candidates.next().unwrap_or(encoding_rs::UTF_8)
});

/// Reads the whole file at `path` asynchronously and returns its text content.
///
/// The bytes are handed to [`auto_decode`]; if no encoding can be determined, the content
/// is decoded as lossy UTF-8 instead, so only I/O failures surface as errors.
///
/// # Errors
///
/// Returns an error when the file cannot be opened or read.
///
/// # Example
///
/// ```no_run
/// # use e_utils::system::encode::a_auto_decode_file;
/// # async fn run() -> Result<(), Box<dyn std::error::Error>> {
/// let content = a_auto_decode_file("path/to/file.txt").await?;
/// println!("File content: {}", content);
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "tokio")]
pub async fn a_auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
  let bytes = tokio::fs::read(path).await?;

  // Prefer the detected encoding; fall back to lossy UTF-8 when detection fails.
  let text = match auto_decode(&bytes) {
    Ok(decoded) => decoded,
    Err(_) => String::from_utf8_lossy(&bytes).into_owned(),
  };
  Ok(text)
}

/// Reads the whole file at `path` (blocking) and returns its text content.
///
/// The bytes are handed to [`auto_decode`]; if no encoding can be determined, the content
/// is decoded as lossy UTF-8 instead, so only I/O failures surface as errors.
///
/// # Errors
///
/// Returns an error when the file cannot be opened or read.
///
/// # Example
///
/// ```no_run
/// # use e_utils::system::encode::auto_decode_file;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let content = auto_decode_file("path/to/file.txt")?;
/// println!("File content: {}", content);
/// # Ok(())
/// # }
/// ```
pub fn auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
  // Slurp the raw bytes first; decoding needs the complete content.
  let mut bytes = Vec::new();
  File::open(path)?.read_to_end(&mut bytes)?;

  // Prefer the detected encoding; fall back to lossy UTF-8 when detection fails.
  let text = match auto_decode(&bytes) {
    Ok(decoded) => decoded,
    Err(_) => String::from_utf8_lossy(&bytes).into_owned(),
  };
  Ok(text)
}

/// Decodes `input` to a `String`, choosing the detection strategy from the system locale.
///
/// When the system encoding (`LANG_ENCODING`) is UTF-8 the cheaper
/// [`auto_decode_simple`] is used; for any other system encoding the broader
/// [`auto_decode_all`] is used.
///
/// # Errors
///
/// Propagates the error of the selected strategy when no encoding fits.
///
/// # Example
///
/// ```
/// # use e_utils::system::encode::auto_decode;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let bytes = vec![0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD]; // "你好" in UTF-8
/// let decoded = auto_decode(&bytes)?;
/// assert_eq!(decoded, "你好");
/// # Ok(())
/// # }
/// ```
pub fn auto_decode(input: &[u8]) -> crate::Result<String> {
  let system_is_utf8 = LANG_ENCODING.name() == encoding_rs::UTF_8.name();
  if system_is_utf8 {
    auto_decode_simple(input)
  } else {
    auto_decode_all(input)
  }
}

/// Decodes `input` by trying a short list of encodings: strict UTF-8, then the system
/// encoding (`LANG_ENCODING`), then GBK.
///
/// The first candidate that decodes without errors and is not judged garbled by
/// [`is_garbled`] wins; the returned text is trimmed of surrounding whitespace. A leading
/// UTF-8 byte-order mark is skipped so it never shows up as U+FEFF in the result.
///
/// # Errors
///
/// Returns an error when none of the candidates produces acceptable text.
///
/// # Example
///
/// ```
/// # use e_utils::system::encode::auto_decode_simple;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let bytes = vec![0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD]; // "你好" in UTF-8
/// let decoded = auto_decode_simple(&bytes)?;
/// assert_eq!(decoded, "你好");
/// # Ok(())
/// # }
/// ```
pub fn auto_decode_simple(input: &[u8]) -> crate::Result<String> {
  /// UTF-8 byte-order mark.
  const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];

  // Fast path: nothing to decode.
  if input.is_empty() {
    return Ok(String::new());
  }

  // Strict UTF-8 first. The BOM is valid UTF-8 but is not whitespace, so `trim` would
  // leave it in the output; drop it before validating.
  let utf8_payload = if input.starts_with(UTF8_BOM) {
    &input[UTF8_BOM.len()..]
  } else {
    input
  };
  if let Ok(text) = std::str::from_utf8(utf8_payload) {
    if !is_garbled(text) {
      return Ok(text.trim().to_owned());
    }
  }

  // Then the system encoding, and finally GBK as the last resort.
  let candidates = [*LANG_ENCODING, encoding_rs::GBK];
  for encoding in candidates.iter() {
    let (decoded, _, had_errors) = encoding.decode(input);
    if !had_errors && !is_garbled(&decoded) {
      return Ok(decoded.trim().to_owned());
    }
  }

  Err("找不到编码".into())
}
/// Decodes `input` by trying, in order: the encoding announced by a byte-order mark,
/// strict UTF-8, the system encoding (`LANG_ENCODING`), and a prioritized list of common
/// legacy encodings.
///
/// A candidate is accepted only when it decodes without errors and the result is not
/// judged garbled by [`is_garbled`]; the returned text is trimmed of surrounding
/// whitespace. A leading UTF-8 byte-order mark is skipped so it never shows up as U+FEFF
/// in the result.
///
/// # Errors
///
/// Returns an error when none of the candidates produces acceptable text.
///
/// # Example
///
/// ```
/// # use e_utils::system::encode::auto_decode_all;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let bytes = vec![0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD]; // "你好" in UTF-8
/// let decoded = auto_decode_all(&bytes)?;
/// assert_eq!(decoded, "你好");
/// # Ok(())
/// # }
/// ```
pub fn auto_decode_all(input: &[u8]) -> crate::Result<String> {
  /// UTF-8 byte-order mark.
  const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];

  /// Returns the trimmed text when decoding was clean and the result is not garbled.
  fn accept(text: &str, had_errors: bool) -> Option<String> {
    if had_errors || is_garbled(text) {
      None
    } else {
      Some(text.trim().to_owned())
    }
  }

  // Fast path: nothing to decode.
  if input.is_empty() {
    return Ok(String::new());
  }

  // A byte-order mark recognised by `check_bom` is tried first.
  if let Some(encoding) = check_bom(input) {
    let (decoded, _, had_errors) = encoding.decode(input);
    if let Some(text) = accept(&decoded, had_errors) {
      return Ok(text);
    }
  }

  // Strict UTF-8. The BOM is valid UTF-8 but is not whitespace, so `trim` would leave it
  // in the output; drop it before validating.
  let utf8_payload = if input.starts_with(UTF8_BOM) {
    &input[UTF8_BOM.len()..]
  } else {
    input
  };
  if let Ok(text) = std::str::from_utf8(utf8_payload) {
    if !is_garbled(text) {
      return Ok(text.trim().to_owned());
    }
  }

  // System encoding. It must pass the same garbled check as every other candidate,
  // otherwise the fallbacks below are never consulted.
  let (decoded, _, had_errors) = LANG_ENCODING.decode(input);
  if let Some(text) = accept(&decoded, had_errors) {
    return Ok(text);
  }

  // Remaining encodings, in priority order.
  let fallbacks = [
    encoding_rs::GBK,          // Simplified Chinese
    encoding_rs::BIG5,         // Traditional Chinese
    encoding_rs::SHIFT_JIS,    // Japanese
    encoding_rs::EUC_KR,       // Korean
    encoding_rs::WINDOWS_1251, // Cyrillic
    encoding_rs::WINDOWS_1252, // Western European
    encoding_rs::WINDOWS_1256, // Arabic
    encoding_rs::WINDOWS_874,  // Thai
    encoding_rs::WINDOWS_1258, // Vietnamese
    encoding_rs::WINDOWS_1250, // Central/Eastern European
    encoding_rs::WINDOWS_1257, // Baltic
    encoding_rs::WINDOWS_1254, // Turkish
    encoding_rs::WINDOWS_1253, // Greek
    encoding_rs::WINDOWS_1255, // Hebrew
  ];

  for encoding in fallbacks.iter() {
    let (decoded, _, had_errors) = encoding.decode(input);
    if let Some(text) = accept(&decoded, had_errors) {
      return Ok(text);
    }
  }

  Err("找不到编码".into())
}

/// 内联 BOM 检查，避免额外的函数调用
#[inline(always)]
pub fn check_bom(input: &[u8]) -> Option<&'static Encoding> {
  if input.len() < 2 {
    return None;
  }
  match (input[0], input[1]) {
    (0xFF, 0xFE) => Some(encoding_rs::UTF_16LE),
    (0xFE, 0xFF) => Some(encoding_rs::UTF_16BE),
    _ => None,
  }
}

/// Returns `true` when `input` is well-formed UTF-8, as judged by `std::str::from_utf8`.
#[inline]
pub fn is_utf8(input: &[u8]) -> bool {
  std::str::from_utf8(input).is_ok()
}
/// Heuristically decides whether `s` looks like garbled (mis-decoded) text.
///
/// A character counts as suspicious when it is in the private-use area
/// (U+E000..=U+F8FF), is a control character other than `\n`, `\r` or `\t`, or is a
/// non-ASCII character rejected by `is_valid_unicode`. The string is considered garbled
/// when more than 40% of its characters are suspicious. An empty string is never garbled.
#[inline]
pub fn is_garbled(s: &str) -> bool {
    if s.is_empty() {
        return false;
    }

    // Single pass: count every character and, alongside, the suspicious ones.
    let (total, suspicious) = s.chars().fold((0usize, 0usize), |(total, suspicious), ch| {
        let private_use = ('\u{E000}'..='\u{F8FF}').contains(&ch);
        let stray_control = ch.is_control() && !matches!(ch, '\n' | '\r' | '\t');
        let unrecognised = !ch.is_ascii() && !is_valid_unicode(ch);
        let flagged = private_use || stray_control || unrecognised;
        (total + 1, suspicious + usize::from(flagged))
    });

    (suspicious as f32 / total as f32) > 0.4
}


/// Returns `true` when `c` belongs to a class of characters expected in ordinary text:
/// alphabetic or numeric characters, ASCII punctuation, or one of the explicitly listed
/// CJK, emoji and symbol ranges.
#[inline]
fn is_valid_unicode(c: char) -> bool {
    // General categories covering letters, digits and basic punctuation.
    if c.is_alphabetic() || c.is_numeric() || c.is_ascii_punctuation() {
        return true;
    }

    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'         // CJK Unified Ideographs
            | '\u{3040}'..='\u{309F}'   // Hiragana
            | '\u{30A0}'..='\u{30FF}'   // Katakana
            | '\u{AC00}'..='\u{D7AF}'   // Hangul syllables
            | '\u{1F300}'..='\u{1F9FF}' // Emoji and assorted pictographs
            | '\u{2600}'..='\u{26FF}'   // Miscellaneous symbols
            | '\u{2700}'..='\u{27BF}'   // Dingbats
            | '\u{1F000}'..='\u{1F02F}' // Mahjong tiles
            | '\u{1F0A0}'..='\u{1F0FF}' // Playing cards
            | '\u{1F100}'..='\u{1F1FF}' // Enclosed alphanumeric supplement
            | '\u{1F200}'..='\u{1F2FF}' // Enclosed ideographic supplement
    )
}
/// Returns the locale identifiers reported by `sys_locale::get_locales`.
///
/// NOTE(review): presumably ordered by user preference, most preferred first — confirm
/// against the `sys_locale` documentation.
#[inline]
pub fn get_lang() -> impl Iterator<Item = String> {
  sys_locale::get_locales()
}
/// 获取语言编码
#[inline]
pub fn get_lang_encoding() -> impl Iterator<Item = &'static Encoding> {
  get_lang().filter_map(|locale| {
    let locale = locale.to_lowercase();
    Some(match locale {
      // 东亚编码
      l if l.contains("zh-cn") || l.contains("zh-sg") => encoding_rs::GBK,
      l if l.contains("zh-tw") || l.contains("zh-hk") => encoding_rs::BIG5,
      l if l.contains("ja") => encoding_rs::SHIFT_JIS,
      l if l.contains("ko") => encoding_rs::EUC_KR,

      // 西里尔文编码
      l if l.contains("ru") || l.contains("uk") || l.contains("be") => encoding_rs::WINDOWS_1251,

      // 中东编码
      l if l.contains("ar") || l.contains("he") || l.contains("fa") => encoding_rs::WINDOWS_1256,

      // 南亚和东南亚编码
      l if l.contains("th") => encoding_rs::WINDOWS_874,
      l if l.contains("vi") => encoding_rs::WINDOWS_1258,

      // 欧洲编码
      // 东欧
      l if l.contains("cs")
        || l.contains("hu")
        || l.contains("pl")
        || l.contains("ro")
        || l.contains("hr")
        || l.contains("sk")
        || l.contains("sl")
        || l.contains("sr") =>
      {
        encoding_rs::WINDOWS_1250
      }
      // 西欧
      l if l.contains("de")
        || l.contains("fr")
        || l.contains("es")
        || l.contains("it")
        || l.contains("pt")
        || l.contains("nl")
        || l.contains("sv")
        || l.contains("da")
        || l.contains("no")
        || l.contains("fi") =>
      {
        encoding_rs::WINDOWS_1252
      }
      // 希腊
      l if l.contains("el") => encoding_rs::WINDOWS_1253,
      // 土耳其
      l if l.contains("tr") => encoding_rs::WINDOWS_1254,
      // 波罗的海
      l if l.contains("et") || l.contains("lt") || l.contains("lv") => encoding_rs::WINDOWS_1257,
      // 默认使用 UTF-8
      _ => encoding_rs::UTF_8,
    })
  })
}

#[cfg(test)]
mod tests {
  use super::*;

  /// Plain ASCII text passes through unchanged.
  #[test]
  fn test_auto_decode_utf8() {
    let input = "Hello, world!".as_bytes();
    assert_eq!(auto_decode(input).unwrap(), "Hello, world!");
  }

  /// UTF-16LE input announced by its byte-order mark.
  #[test]
  fn test_auto_decode_utf16le() {
    let input = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00";
    assert_eq!(auto_decode(input).unwrap(), "Hello");
  }

  /// UTF-16BE input announced by its byte-order mark.
  #[test]
  fn test_auto_decode_utf16be() {
    let input = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o";
    assert_eq!(auto_decode(input).unwrap(), "Hello");
  }

  #[test]
  fn test_auto_decode_ascii() {
    let input = b"Hello, ASCII!";
    assert_eq!(auto_decode(input).unwrap(), "Hello, ASCII!");
  }

  #[test]
  fn test_auto_decode_empty_input() {
    let input = b"";
    assert_eq!(auto_decode(input).unwrap(), "");
  }

  #[test]
  fn test_is_utf8_edge_cases() {
    assert!(is_utf8(&[0xF0, 0x90, 0x80, 0x80])); // smallest four-byte UTF-8 sequence
    assert!(is_utf8(&[0xF4, 0x8F, 0xBF, 0xBF])); // largest four-byte UTF-8 sequence
    assert!(!is_utf8(&[0xF4, 0x90, 0x80, 0x80])); // four-byte sequence beyond the Unicode range
    assert!(!is_utf8(&[0xC0, 0x80])); // overlong encoding
    assert!(!is_utf8(&[0xE0, 0x80, 0x80])); // overlong encoding
  }

  /// Round-trips files through `a_auto_decode_file`.
  ///
  /// Gated on the `tokio` feature because `a_auto_decode_file` only exists when that
  /// feature is enabled.
  #[cfg(feature = "tokio")]
  #[tokio::test]
  async fn test_error_code() -> crate::AnyResult<()> {
    // Russian text, written to disk as UTF-8.
    tokio::fs::write("target/error_code12.log", encoding_rs::UTF_8.encode("Привет").0).await?;
    assert_eq!("Привет".to_string(), a_auto_decode_file("target/error_code12.log").await.unwrap());
    // Traditional Chinese characters, written to disk as GBK.
    tokio::fs::write("target/error_code14.log", encoding_rs::GBK.encode("你好臺灣").0).await?;
    assert_eq!("你好臺灣".to_string(), a_auto_decode_file("target/error_code14.log").await.unwrap());
    // Japanese text, written to disk as UTF-8.
    tokio::fs::write("target/error_code11.log", encoding_rs::UTF_8.encode("こんにちは").0).await?;
    assert_eq!("こんにちは".to_string(), a_auto_decode_file("target/error_code11.log").await.unwrap());
    // Korean text, written to disk as UTF-8.
    tokio::fs::write("target/error_code13.log", encoding_rs::UTF_8.encode("안녕하세요").0).await?;
    assert_eq!("안녕하세요".to_string(), a_auto_decode_file("target/error_code13.log").await.unwrap());
    Ok(())
  }

  #[test]
  fn test_is_garbled() {
    // Basics
    assert!(!is_garbled("Hello, 世界！")); // ordinary mixed Chinese/English
    assert!(!is_garbled("")); // empty string
    assert!(!is_garbled("こんにちは")); // ordinary Japanese
    assert!(is_garbled("��������")); // typical garbled text

    // Multiple languages
    assert!(!is_garbled("안녕하세요")); // Korean
    assert!(!is_garbled("Привет мир")); // Russian
    assert!(!is_garbled("مرحبا بالعالم")); // Arabic
    assert!(!is_garbled("ยินดีต้อนรับ")); // Thai

    // Mixed content
    assert!(!is_garbled("Hello世界こんにちは안녕123!@#")); // several languages at once
    assert!(!is_garbled("📱🌍🎉🎨")); // emoji
    assert!(!is_garbled("表情😊混合🌟测试")); // text mixed with emoji

    // Special characters
    assert!(!is_garbled("\n\r\t")); // common control characters
    assert!(!is_garbled("Hello\nWorld\r\n")); // ordinary text with line breaks
    assert!(is_garbled("\0\0\0\0\0")); // run of NUL characters
    assert!(!is_garbled("Hello\0World")); // a single NUL among ordinary text

    // Boundary cases
    assert!(is_garbled("\u{E000}\u{E001}\u{E002}\u{E003}\u{E004}")); // only private-use characters
    assert!(!is_garbled("     ")); // only spaces
    assert!(!is_garbled("!@#$%^&*()")); // only punctuation
    assert!(!is_garbled("1234567890")); // only digits
    assert!(!is_garbled("þÿ")); // Latin-1 letters

    // Garbled patterns
    assert!(is_garbled("���������")); // replacement characters
    assert!(is_garbled("\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}")); // U+FFFD written as escapes
  }
}