e_utils/system/
encode.rs

use encoding_rs::Encoding;
use once_cell::sync::Lazy;
use std::{env, fs::File, io::Read as _, path::Path};

/// Legacy encoding suggested by the `LANG` environment variable, if any.
///
/// The value comes from `get_lang_encoding` and is wrapped in `Lazy`, so the
/// initializer runs on first access and the result is cached afterwards;
/// later changes to `LANG` are therefore not reflected here.
static LANG_ENCODING: Lazy<Option<&'static Encoding>> = Lazy::new(get_lang_encoding);

/// Reads the whole file at `path` through Tokio and decodes it with
/// [`auto_decode`].
///
/// # Errors
///
/// Returns an error when the file cannot be opened or read, or when
/// [`auto_decode`] rejects the bytes.
///
/// # Example
///
/// ```no_run
/// # use e_utils::system::encode::a_auto_decode_file;
/// # async fn run() -> Result<(), Box<dyn std::error::Error>> {
/// let content = a_auto_decode_file("path/to/file.txt").await?;
/// println!("File content: {}", content);
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "tokio")]
pub async fn a_auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
  use tokio::io::AsyncReadExt as _;

  // Pull the complete file into memory; detection needs the raw bytes.
  let mut bytes = Vec::new();
  let mut handle = tokio::fs::File::open(path).await?;
  handle.read_to_end(&mut bytes).await?;

  // Hand the raw bytes to the shared detection + decoding routine.
  let text = auto_decode(&bytes)?;
  Ok(text)
}

/// Reads the whole file at `path` (blocking) and decodes it with
/// [`auto_decode`].
///
/// # Errors
///
/// Returns an error when the file cannot be opened or read, or when
/// [`auto_decode`] rejects the bytes.
///
/// # Example
///
/// ```no_run
/// # use e_utils::system::encode::auto_decode_file;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let content = auto_decode_file("path/to/file.txt")?;
/// println!("File content: {}", content);
/// # Ok(())
/// # }
/// ```
pub fn auto_decode_file<P: AsRef<Path>>(path: P) -> crate::AnyResult<String> {
  // Pull the complete file into memory; detection needs the raw bytes.
  let mut bytes = Vec::new();
  let mut handle = File::open(path)?;
  handle.read_to_end(&mut bytes)?;

  // Hand the raw bytes to the shared detection + decoding routine.
  let text = auto_decode(&bytes)?;
  Ok(text)
}

/// Guesses the encoding of `input`, decodes it, and returns the text with
/// leading and trailing whitespace trimmed.
///
/// # Errors
///
/// Returns `Error::Decode` when the chosen encoding reports malformed
/// sequences, or when the decoded text fails the `is_garbled` check.
///
/// # Example
///
/// ```
/// # use e_utils::system::encode::auto_decode;
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let bytes = vec![0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD]; // "你好" in UTF-8
/// let decoded = auto_decode(&bytes)?;
/// assert_eq!(decoded, "你好");
/// # Ok(())
/// # }
/// ```
pub fn auto_decode(input: &[u8]) -> crate::Result<String> {
  // `decode` reports the encoding it actually used alongside the text.
  let (text, used, malformed) = detect_encoding(input).decode(input);

  if malformed {
    return Err(crate::Error::Decode(format!(
      "Decoding error with {:?}",
      used
    )));
  }

  // A decode without errors can still be the wrong guess; reject text that
  // looks like control-character noise.
  if is_garbled(&text) {
    return Err(crate::Error::Decode(format!(
      "Garbled text detected with {:?}",
      used
    )));
  }

  Ok(text.trim().to_owned())
}
/// Picks the encoding that `auto_decode` should use for `input`.
///
/// Only the first `MAX_CHECK_LEN` bytes are inspected. Candidates are tried in
/// this order:
///
/// 1. empty input -> UTF-8;
/// 2. a UTF-8 / UTF-16LE / UTF-16BE byte-order mark;
/// 3. the sample is well-formed UTF-8;
/// 4. the cached `LANG`-derived encoding, if the sample decodes without errors;
/// 5. the built-in fallback list (currently GBK only);
/// 6. UTF-8 as the last resort.
///
/// When the sample is only a prefix of `input`, the cut may land in the middle
/// of a multi-byte character. An incomplete trailing sequence is therefore not
/// treated as an error for truncated samples; any other malformed byte still is.
fn detect_encoding(input: &[u8]) -> &'static Encoding {
  const MAX_CHECK_LEN: usize = 1024;
  let sample = &input[..input.len().min(MAX_CHECK_LEN)];
  // True when bytes follow the sample, i.e. its tail may be half a character.
  let truncated = sample.len() < input.len();

  if sample.is_empty() {
    return encoding_rs::UTF_8;
  }

  // Byte-order marks are authoritative.
  match sample {
    [0xEF, 0xBB, 0xBF, ..] => return encoding_rs::UTF_8,
    [0xFF, 0xFE, ..] => return encoding_rs::UTF_16LE,
    [0xFE, 0xFF, ..] => return encoding_rs::UTF_16BE,
    _ => {}
  }

  // Fast path: ASCII and UTF-8, tolerating a character split by the sample cut.
  if is_utf8(sample) || (truncated && sample_ends_mid_utf8_sequence(sample)) {
    return encoding_rs::UTF_8;
  }

  // Prefer the encoding suggested by the (cached) LANG setting.
  if let Some(encoding) = *LANG_ENCODING {
    if sample_decodes_cleanly(encoding, sample, !truncated) {
      return encoding;
    }
  }

  // Remaining built-in candidates; UTF-8 when none of them fits.
  [encoding_rs::GBK]
    .iter()
    .copied()
    .find(|&encoding| sample_decodes_cleanly(encoding, sample, !truncated))
    .unwrap_or(encoding_rs::UTF_8)
}

/// Returns `true` when `sample` is valid UTF-8 except for an incomplete
/// multi-byte sequence at its very end.
fn sample_ends_mid_utf8_sequence(sample: &[u8]) -> bool {
  match std::str::from_utf8(sample) {
    Ok(_) => false,
    // `error_len() == None` means the input ended unexpectedly rather than
    // containing an invalid byte.
    Err(err) => err.error_len().is_none(),
  }
}

/// Returns `true` when `sample` decodes as `encoding` without malformed
/// sequences. With `is_complete == false` the decoder is told more input
/// follows, so an unfinished trailing sequence is not counted as an error.
fn sample_decodes_cleanly(encoding: &'static Encoding, sample: &[u8], is_complete: bool) -> bool {
  let mut decoder = encoding.new_decoder_without_bom_handling();
  let capacity = match decoder.max_utf8_buffer_length(sample.len()) {
    Some(capacity) => capacity,
    None => return false,
  };
  let mut scratch = String::with_capacity(capacity);
  let (status, _read, had_errors) = decoder.decode_to_string(sample, &mut scratch, is_complete);
  !had_errors && matches!(status, encoding_rs::CoderResult::InputEmpty)
}

/// Reports whether `input` is, in its entirety, well-formed UTF-8.
#[inline]
fn is_utf8(input: &[u8]) -> bool {
  let validation = std::str::from_utf8(input);
  validation.is_ok()
}
/// Heuristically reports whether `s` looks like the product of a wrong decode.
///
/// A string is considered garbled when it contains an ASCII control byte other
/// than tab (0x09), line feed (0x0A) or carriage return (0x0D), or the DEL
/// byte (0x7F). The empty string is never garbled.
fn is_garbled(s: &str) -> bool {
  // Scanning bytes is enough: every byte of a multi-byte UTF-8 sequence is
  // >= 0x80, so only genuine ASCII control characters can match. A `&str` is
  // always valid UTF-8, so no per-`char` validity check is needed.
  s.bytes()
    .any(|b| (b < 0x20 && !matches!(b, b'\t' | b'\n' | b'\r')) || b == 0x7F)
}

/// Maps the `LANG` environment variable to the legacy encoding most commonly
/// used for that language, or `None` when `LANG` is unset, not valid Unicode,
/// or names a language without a mapping.
///
/// `LANG` is parsed as `language[_territory][.codeset][@modifier]`
/// (case-insensitively) and only the language/territory part is matched.
/// Matching substrings of the whole value would let the codeset leak into the
/// decision, e.g. `ru_RU.KOI8-R` contains "ko" and used to be mapped to EUC-KR.
fn get_lang_encoding() -> Option<&'static Encoding> {
  let lang = env::var("LANG").ok()?.to_lowercase();

  // Drop ".codeset" and "@modifier"; neither takes part in the mapping.
  let locale = lang
    .split(|c: char| c == '.' || c == '@')
    .next()
    .unwrap_or("");

  let mut parts = locale.splitn(2, '_');
  let language = parts.next().unwrap_or("");
  let territory = parts.next().unwrap_or("");

  match (language, territory) {
    ("zh", "cn") | ("zh", "sg") => Some(encoding_rs::GBK),
    ("zh", "tw") | ("zh", "hk") => Some(encoding_rs::BIG5),
    // The long names are legacy locale aliases that the previous
    // substring/prefix checks also accepted.
    ("ja", _) | ("japanese", _) => Some(encoding_rs::SHIFT_JIS),
    ("ko", _) | ("korean", _) => Some(encoding_rs::EUC_KR),
    ("ru", _) | ("russian", _) => Some(encoding_rs::WINDOWS_1251),
    // ... further language mappings go here ...
    _ => None,
  }
}

// Disabled script-range heuristic (currently commented out, kept for reference).
// fn is_valid_unicode_range(c: char) -> bool {
//   let code = c as u32;
//   (0x4E00..=0x9FFF).contains(&code) || // common CJK ideographs
//     (0x3040..=0x30FF).contains(&code) || // Japanese kana
//     (0xAC00..=0xD7AF).contains(&code) || // Korean Hangul syllables
//     (0x0400..=0x04FF).contains(&code) || // Cyrillic
//     (0x0370..=0x03FF).contains(&code) || // Greek
//     (0x0600..=0x06FF).contains(&code) || // Arabic
//     (0x0900..=0x097F).contains(&code) || // Devanagari
//     c.is_alphanumeric() || // also accept letters and digits
//     c.is_ascii_punctuation() // is_ascii_punctuation used in place of is_punctuation
// }

#[cfg(test)]
mod tests {
  use tempdir::TempDir;

  use super::*;
  use std::{env, io::Write as _};

  /// Serializes every test that mutates the process-wide `LANG` variable.
  /// The test harness runs tests on several threads, so without this lock one
  /// test could overwrite `LANG` between another test's `set_var` and its
  /// assertion.
  static LANG_ENV_LOCK: once_cell::sync::Lazy<std::sync::Mutex<()>> =
    once_cell::sync::Lazy::new(|| std::sync::Mutex::new(()));

  /// Acquires `LANG_ENV_LOCK`, ignoring poisoning so that one failed test does
  /// not cascade into unrelated failures.
  fn lock_lang_env() -> std::sync::MutexGuard<'static, ()> {
    LANG_ENV_LOCK
      .lock()
      .unwrap_or_else(|poisoned| poisoned.into_inner())
  }

  /// Sets `LANG` to a Simplified Chinese locale and forces the once-only
  /// `LANG_ENCODING` cache while holding the lock. Every test in this module
  /// that decodes GBK bytes calls this first, so the cache cannot be
  /// initialized here from a `LANG` value written by another test.
  fn pin_lang_encoding_to_gbk() {
    let _guard = lock_lang_env();
    env::set_var("LANG", "zh_CN.UTF-8");
    let _ = once_cell::sync::Lazy::force(&LANG_ENCODING);
  }

  #[test]
  fn test_auto_decode_utf8() {
    let input = "Hello, world!".as_bytes();
    assert_eq!(auto_decode(input).unwrap(), "Hello, world!");
  }

  #[test]
  fn test_auto_decode_utf8_with_bom() {
    let input = b"\xEF\xBB\xBFHello, world!";
    assert_eq!(auto_decode(input).unwrap(), "Hello, world!");
  }

  #[test]
  fn test_auto_decode_utf16le() {
    let input = b"\xFF\xFEH\x00e\x00l\x00l\x00o\x00";
    assert_eq!(auto_decode(input).unwrap(), "Hello");
  }

  #[test]
  fn test_auto_decode_utf16be() {
    let input = b"\xFE\xFF\x00H\x00e\x00l\x00l\x00o";
    assert_eq!(auto_decode(input).unwrap(), "Hello");
  }

  #[test]
  fn test_auto_decode_ascii() {
    let input = b"Hello, ASCII!";
    assert_eq!(auto_decode(input).unwrap(), "Hello, ASCII!");
  }

  #[test]
  fn test_auto_decode_empty_input() {
    let input = b"";
    assert_eq!(auto_decode(input).unwrap(), "");
  }

  #[test]
  fn test_auto_decode_invalid_utf8() {
    // Not valid UTF-8; the leading FF FE matches the UTF-16LE BOM arm of
    // `detect_encoding`.
    let input = b"\xFF\xFE\xFF\xFE";
    // Decoding is expected to succeed through a non-UTF-8 encoding.
    assert!(auto_decode(input).is_ok());
  }

  #[test]
  fn test_auto_decode_gbk() {
    pin_lang_encoding_to_gbk();
    let input = b"\xC4\xE3\xBA\xC3"; // "你好" in GBK
    assert_eq!(auto_decode(input).unwrap(), "你好");
  }

  #[test]
  fn test_auto_decode_mixed_encoding() {
    pin_lang_encoding_to_gbk();
    let input = b"Hello, \xC4\xE3\xBA\xC3"; // ASCII + GBK
    assert_eq!(auto_decode(input).unwrap(), "Hello, 你好");
  }

  #[test]
  fn test_detect_encoding_long_input() {
    let long_input = "A".repeat(2000).into_bytes();
    assert_eq!(detect_encoding(&long_input), encoding_rs::UTF_8);
  }

  #[test]
  fn test_detect_encoding_with_non_bmp_characters() {
    let input = "😀🌍🚀".as_bytes(); // UTF-8 containing non-BMP characters
    assert_eq!(detect_encoding(input), encoding_rs::UTF_8);
  }

  #[test]
  fn test_is_utf8() {
    assert!(is_utf8("Hello, UTF-8!".as_bytes()));
    assert!(is_utf8("你好，UTF-8！".as_bytes()));
    assert!(is_utf8("🌍🌎🌏".as_bytes()));
    assert!(!is_utf8(b"\xFF\xFE\x00\x00"));
  }

  #[test]
  fn test_is_utf8_edge_cases() {
    assert!(is_utf8(&[0xF0, 0x90, 0x80, 0x80])); // smallest four-byte UTF-8 sequence
    assert!(is_utf8(&[0xF4, 0x8F, 0xBF, 0xBF])); // largest four-byte UTF-8 sequence
    assert!(!is_utf8(&[0xF4, 0x90, 0x80, 0x80])); // four-byte sequence beyond the Unicode range
    assert!(!is_utf8(&[0xC0, 0x80])); // overlong encoding
    assert!(!is_utf8(&[0xE0, 0x80, 0x80])); // overlong encoding
  }

  #[test]
  fn test_get_lang_encoding() {
    let _guard = lock_lang_env();

    env::set_var("LANG", "zh_CN.UTF-8");
    assert_eq!(get_lang_encoding(), Some(encoding_rs::GBK));

    env::set_var("LANG", "ja_JP.UTF-8");
    assert_eq!(get_lang_encoding(), Some(encoding_rs::SHIFT_JIS));

    env::set_var("LANG", "en_US.UTF-8");
    assert_eq!(get_lang_encoding(), None);
  }

  #[test]
  fn test_get_lang_encoding_case_insensitive() {
    let _guard = lock_lang_env();

    env::set_var("LANG", "ZH_CN.UTF-8");
    assert_eq!(get_lang_encoding(), Some(encoding_rs::GBK));
    env::set_var("LANG", "JA_jp.utf-8");
    assert_eq!(get_lang_encoding(), Some(encoding_rs::SHIFT_JIS));
  }

  #[test]
  fn test_get_lang_encoding_invalid_lang() {
    let _guard = lock_lang_env();

    env::set_var("LANG", "invalid.UTF-8");
    assert_eq!(get_lang_encoding(), None);
  }

  #[test]
  fn test_not_is_garbled() {
    assert!(
      !is_garbled("Hello, world!"),
      "ASCII text should not be garbled"
    );
    assert!(
      !is_garbled("你好，世界！"),
      "Chinese text should not be garbled"
    );
    assert!(
      !is_garbled("こんにちは、世界！"),
      "Japanese text should not be garbled"
    );
    assert!(
      !is_garbled("안녕하세요, 세계!"),
      "Korean text should not be garbled"
    );
    assert!(
      !is_garbled("Здравствуй, мир!"),
      "Russian text should not be garbled"
    );
    // NOTE(review): the failure message below contradicts the assertion, which
    // expects this input to be accepted — confirm the intended behaviour.
    assert!(
      !is_garbled("Hello, !"),
      "Text with replacement characters should be garbled"
    );
  }
  #[test]
  fn test_is_garbled() {
    assert!(
      is_garbled("\x01\x02\x03\x04\x05"),
      "Text with control characters should be garbled"
    );
    assert!(!is_garbled("    "), "Whitespace should not be garbled");
    assert!(!is_garbled(""), "Empty string should not be garbled");
  }

  /// Writes `content` to `filename` inside `dir` and returns the full path.
  fn create_test_file(dir: &TempDir, filename: &str, content: &[u8]) -> std::path::PathBuf {
    let file_path = dir.path().join(filename);
    let mut file = File::create(&file_path).unwrap();
    file.write_all(content).unwrap();
    file_path
  }

  #[test]
  fn test_auto_decode_file() {
    pin_lang_encoding_to_gbk();
    let temp_dir = TempDir::new("test_auto_decode_file").unwrap();

    // UTF-8 file
    let utf8_path = create_test_file(&temp_dir, "utf8.txt", "你好，世界！".as_bytes());
    assert_eq!(auto_decode_file(utf8_path).unwrap(), "你好，世界！");

    // GBK file
    let gbk_path = create_test_file(
      &temp_dir,
      "gbk.txt",
      b"\xC4\xE3\xBA\xC3\xA3\xAC\xCA\xC0\xBD\xE7\xA3\xA1",
    );
    assert_eq!(auto_decode_file(gbk_path).unwrap(), "你好，世界！");

    // Empty file
    let empty_path = create_test_file(&temp_dir, "empty.txt", b"");
    assert_eq!(auto_decode_file(empty_path).unwrap(), "");

    // Missing file
    let result = auto_decode_file("non_existent_file.txt");
    assert!(result.is_err());
  }

  #[cfg(feature = "tokio")]
  #[tokio::test]
  async fn test_a_auto_decode_file() {
    // Pinning happens synchronously up front, so no lock guard is held across
    // an `.await`.
    pin_lang_encoding_to_gbk();
    let temp_dir = TempDir::new("test_a_auto_decode_file").unwrap();

    // UTF-8 file
    let utf8_path = create_test_file(&temp_dir, "utf8_async.txt", "你好，世界！".as_bytes());
    assert_eq!(a_auto_decode_file(utf8_path).await.unwrap(), "你好，世界！");

    // GBK file
    let gbk_path = create_test_file(
      &temp_dir,
      "gbk_async.txt",
      b"\xC4\xE3\xBA\xC3\xA3\xAC\xCA\xC0\xBD\xE7\xA3\xA1",
    );
    assert_eq!(a_auto_decode_file(gbk_path).await.unwrap(), "你好，世界！");

    // Empty file
    let empty_path = create_test_file(&temp_dir, "empty_async.txt", b"");
    assert_eq!(a_auto_decode_file(empty_path).await.unwrap(), "");

    // Missing file
    let result = a_auto_decode_file("non_existent_file_async.txt").await;
    assert!(result.is_err());
  }
}
e_utils/system/encode.rs

e_utils/system/
encode.rs