subx_cli/core/formats/encoding/
converter.rs

1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use anyhow::anyhow;
4use encoding_rs::{BIG5, Encoding, GBK, ISO_8859_2, SHIFT_JIS, UTF_8, WINDOWS_1252};
5use std::collections::HashMap;
6
7/// 編碼轉換結果
8#[derive(Debug, Clone)]
9pub struct ConversionResult {
10    pub converted_text: String,
11    pub original_encoding: Charset,
12    pub target_encoding: Charset,
13    pub bytes_processed: usize,
14    pub had_errors: bool,
15    pub error_count: usize,
16}
17
/// Encoding converter that decodes byte buffers in a known charset.
pub struct EncodingConverter {
    /// Lookup table from a detected `Charset` to the `encoding_rs` decoder used for it.
    encoding_map: HashMap<Charset, &'static Encoding>,
}
22
23impl EncodingConverter {
24    /// 建立轉換器並初始化編碼映射
25    pub fn new() -> Self {
26        let mut encoding_map = HashMap::new();
27        encoding_map.insert(Charset::Utf8, UTF_8);
28        encoding_map.insert(Charset::Gbk, GBK);
29        encoding_map.insert(Charset::ShiftJis, SHIFT_JIS);
30        encoding_map.insert(Charset::Big5, BIG5);
31        encoding_map.insert(Charset::Windows1252, WINDOWS_1252);
32        encoding_map.insert(Charset::Iso88591, ISO_8859_2);
33        Self { encoding_map }
34    }
35
36    /// 將資料轉換為 UTF-8
37    pub fn convert_to_utf8(
38        &self,
39        data: &[u8],
40        source_encoding: &Charset,
41    ) -> Result<ConversionResult> {
42        if *source_encoding == Charset::Utf8 {
43            return Ok(ConversionResult {
44                converted_text: String::from_utf8_lossy(data).to_string(),
45                original_encoding: Charset::Utf8,
46                target_encoding: Charset::Utf8,
47                bytes_processed: data.len(),
48                had_errors: false,
49                error_count: 0,
50            });
51        }
52        let encoding = self
53            .encoding_map
54            .get(source_encoding)
55            .ok_or_else(|| anyhow!("Unsupported encoding: {:?}", source_encoding))?;
56        let (converted, _, had_errors) = encoding.decode(data);
57        let error_count = if had_errors {
58            self.count_replacement_chars(&converted)
59        } else {
60            0
61        };
62        Ok(ConversionResult {
63            converted_text: converted.into_owned(),
64            original_encoding: source_encoding.clone(),
65            target_encoding: Charset::Utf8,
66            bytes_processed: data.len(),
67            had_errors,
68            error_count,
69        })
70    }
71
72    /// 將檔案內容轉換為 UTF-8
73    pub fn convert_file_to_utf8(
74        &self,
75        file_path: &str,
76        encoding_info: &EncodingInfo,
77    ) -> Result<ConversionResult> {
78        let data = std::fs::read(file_path)?;
79        let slice = if encoding_info.bom_detected {
80            self.skip_bom(&data, &encoding_info.charset)
81        } else {
82            data.as_slice()
83        };
84        self.convert_to_utf8(slice, &encoding_info.charset)
85    }
86
87    fn skip_bom<'a>(&self, data: &'a [u8], charset: &Charset) -> &'a [u8] {
88        match charset {
89            Charset::Utf8 if data.starts_with(&[0xEF, 0xBB, 0xBF]) => &data[3..],
90            Charset::Utf16Le if data.starts_with(&[0xFF, 0xFE]) => &data[2..],
91            Charset::Utf16Be if data.starts_with(&[0xFE, 0xFF]) => &data[2..],
92            Charset::Utf32Le if data.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) => &data[4..],
93            Charset::Utf32Be if data.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) => &data[4..],
94            _ => data,
95        }
96    }
97
98    fn count_replacement_chars(&self, text: &str) -> usize {
99        text.chars().filter(|&c| c == '\u{FFFD}').count()
100    }
101
102    /// 驗證轉換結果
103    pub fn validate_conversion(&self, result: &ConversionResult) -> ValidationResult {
104        ValidationResult {
105            is_valid: !result.had_errors || result.error_count == 0,
106            confidence: if result.had_errors {
107                1.0 - result.error_count as f32 / result.converted_text.len() as f32
108            } else {
109                1.0
110            },
111            warnings: self.generate_warnings(result),
112        }
113    }
114
115    fn generate_warnings(&self, result: &ConversionResult) -> Vec<String> {
116        let mut warnings = Vec::new();
117        if result.had_errors {
118            warnings.push(format!(
119                "Encoding conversion had {} replacement characters",
120                result.error_count
121            ));
122        }
123        if result.error_count > result.bytes_processed / 10 {
124            warnings.push("High error rate detected - encoding may be incorrect".to_string());
125        }
126        warnings
127    }
128}
129
/// Verdict produced when validating a conversion result.
#[derive(Clone, Debug)]
pub struct ValidationResult {
    /// `true` when the conversion reported no errors or no replacement characters
    /// were counted.
    pub is_valid: bool,
    /// Estimated reliability of the conversion; `1.0` when no errors were reported.
    pub confidence: f32,
    /// Human-readable warnings describing problems found in the conversion.
    pub warnings: Vec<String>,
}
137
138impl Default for EncodingConverter {
139    fn default() -> Self {
140        Self::new()
141    }
142}