subx_cli/core/formats/encoding/
detector.rs

1use crate::Result;
2use crate::config::load_config;
3use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
4use std::fs::File;
5use std::io::Read;
6
7/// 編碼檢測引擎
8pub struct EncodingDetector {
9    confidence_threshold: f32,
10    max_sample_size: usize,
11    supported_charsets: Vec<Charset>,
12}
13
14impl EncodingDetector {
15    /// 建立編碼檢測器,從配置讀取信心度閾值
16    pub fn new() -> Result<Self> {
17        let config = load_config()?;
18        Ok(Self {
19            confidence_threshold: config.formats.encoding_detection_confidence,
20            max_sample_size: 8192,
21            supported_charsets: Self::default_charsets(),
22        })
23    }
24
25    /// 檢測檔案編碼
26    pub fn detect_file_encoding(&self, file_path: &str) -> Result<EncodingInfo> {
27        let mut file = File::open(file_path)?;
28        let mut buffer = vec![0; self.max_sample_size];
29        let bytes_read = file.read(&mut buffer)?;
30        buffer.truncate(bytes_read);
31        self.detect_encoding(&buffer)
32    }
33
34    /// 檢測資料編碼
35    pub fn detect_encoding(&self, data: &[u8]) -> Result<EncodingInfo> {
36        if let Some(encoding) = self.detect_bom(data) {
37            return Ok(encoding);
38        }
39        let candidates = self.analyze_byte_patterns(data)?;
40        self.select_best_encoding(candidates, data)
41    }
42
43    fn detect_bom(&self, data: &[u8]) -> Option<EncodingInfo> {
44        if data.len() < 3 {
45            return None;
46        }
47        match &data[0..3] {
48            [0xEF, 0xBB, 0xBF] => Some(EncodingInfo {
49                charset: Charset::Utf8,
50                confidence: 1.0,
51                bom_detected: true,
52                sample_text: String::from("UTF-8 with BOM"),
53            }),
54            [0xFF, 0xFE, ..] => Some(EncodingInfo {
55                charset: Charset::Utf16Le,
56                confidence: 1.0,
57                bom_detected: true,
58                sample_text: String::from("UTF-16 LE with BOM"),
59            }),
60            [0xFE, 0xFF, ..] => Some(EncodingInfo {
61                charset: Charset::Utf16Be,
62                confidence: 1.0,
63                bom_detected: true,
64                sample_text: String::from("UTF-16 BE with BOM"),
65            }),
66            _ => {
67                if data.len() >= 4 {
68                    match &data[0..4] {
69                        [0xFF, 0xFE, 0x00, 0x00] => Some(EncodingInfo {
70                            charset: Charset::Utf32Le,
71                            confidence: 1.0,
72                            bom_detected: true,
73                            sample_text: String::from("UTF-32 LE with BOM"),
74                        }),
75                        [0x00, 0x00, 0xFE, 0xFF] => Some(EncodingInfo {
76                            charset: Charset::Utf32Be,
77                            confidence: 1.0,
78                            bom_detected: true,
79                            sample_text: String::from("UTF-32 BE with BOM"),
80                        }),
81                        _ => None,
82                    }
83                } else {
84                    None
85                }
86            }
87        }
88    }
89
90    fn analyze_byte_patterns(&self, data: &[u8]) -> Result<Vec<EncodingCandidate>> {
91        let mut candidates = Vec::new();
92        for charset in &self.supported_charsets {
93            let confidence = self.calculate_encoding_confidence(data, charset)?;
94            if confidence > 0.1 {
95                candidates.push(EncodingCandidate {
96                    charset: charset.clone(),
97                    confidence,
98                });
99            }
100        }
101        candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
102        Ok(candidates)
103    }
104
105    fn calculate_encoding_confidence(&self, data: &[u8], charset: &Charset) -> Result<f32> {
106        match charset {
107            Charset::Utf8 => self.check_utf8_validity(data),
108            Charset::Gbk => self.check_gbk_patterns(data),
109            Charset::ShiftJis => self.check_shift_jis_patterns(data),
110            Charset::Big5 => self.check_big5_patterns(data),
111            Charset::Iso88591 => self.check_iso88591_patterns(data),
112            Charset::Windows1252 => self.check_windows1252_patterns(data),
113            _ => Ok(0.0),
114        }
115    }
116
117    fn check_utf8_validity(&self, data: &[u8]) -> Result<f32> {
118        let mut valid_chars = 0;
119        let mut total_chars = 0;
120        let mut i = 0;
121
122        while i < data.len() {
123            total_chars += 1;
124            if data[i] & 0x80 == 0 {
125                valid_chars += 1;
126                i += 1;
127            } else if data[i] & 0xE0 == 0xC0 {
128                if i + 1 < data.len() && data[i + 1] & 0xC0 == 0x80 {
129                    valid_chars += 1;
130                }
131                i += 2;
132            } else if data[i] & 0xF0 == 0xE0 {
133                if i + 2 < data.len() && data[i + 1] & 0xC0 == 0x80 && data[i + 2] & 0xC0 == 0x80 {
134                    valid_chars += 1;
135                }
136                i += 3;
137            } else if data[i] & 0xF8 == 0xF0 {
138                if i + 3 < data.len()
139                    && data[i + 1] & 0xC0 == 0x80
140                    && data[i + 2] & 0xC0 == 0x80
141                    && data[i + 3] & 0xC0 == 0x80
142                {
143                    valid_chars += 1;
144                }
145                i += 4;
146            } else {
147                i += 1;
148            }
149        }
150
151        Ok(if total_chars > 0 {
152            valid_chars as f32 / total_chars as f32
153        } else {
154            0.0
155        })
156    }
157
158    fn check_gbk_patterns(&self, data: &[u8]) -> Result<f32> {
159        let mut valid_chars = 0;
160        let mut total_chars = 0;
161        let mut i = 0;
162
163        while i < data.len() {
164            if data[i] < 0x80 {
165                valid_chars += 1;
166                total_chars += 1;
167                i += 1;
168            } else if i + 1 < data.len() {
169                let byte1 = data[i];
170                let byte2 = data[i + 1];
171                if (0x81..=0xFE).contains(&byte1)
172                    && ((0x40..=0x7E).contains(&byte2) || (0x80..=0xFE).contains(&byte2))
173                {
174                    valid_chars += 1;
175                }
176                total_chars += 1;
177                i += 2;
178            } else {
179                total_chars += 1;
180                i += 1;
181            }
182        }
183
184        Ok(if total_chars > 0 {
185            valid_chars as f32 / total_chars as f32
186        } else {
187            0.0
188        })
189    }
190
191    fn check_shift_jis_patterns(&self, data: &[u8]) -> Result<f32> {
192        let mut valid_chars = 0;
193        let mut total_chars = 0;
194        let mut i = 0;
195
196        while i < data.len() {
197            if data[i] < 0x80 {
198                valid_chars += 1;
199                total_chars += 1;
200                i += 1;
201            } else if i + 1 < data.len() {
202                let byte1 = data[i];
203                let byte2 = data[i + 1];
204                if ((0x81..=0x9F).contains(&byte1) || (0xE0..=0xEF).contains(&byte1))
205                    && (0x40..=0xFC).contains(&byte2)
206                    && byte2 != 0x7F
207                {
208                    valid_chars += 1;
209                }
210                total_chars += 1;
211                i += 2;
212            } else {
213                total_chars += 1;
214                i += 1;
215            }
216        }
217
218        Ok(if total_chars > 0 {
219            valid_chars as f32 / total_chars as f32
220        } else {
221            0.0
222        })
223    }
224
225    fn check_big5_patterns(&self, data: &[u8]) -> Result<f32> {
226        let mut valid_chars = 0;
227        let mut total_chars = 0;
228        let mut i = 0;
229
230        while i < data.len() {
231            if data[i] < 0x80 {
232                valid_chars += 1;
233                total_chars += 1;
234                i += 1;
235            } else if i + 1 < data.len() {
236                let byte1 = data[i];
237                let byte2 = data[i + 1];
238                if (0xA1..=0xFE).contains(&byte1)
239                    && ((0x40..=0x7E).contains(&byte2) || (0xA1..=0xFE).contains(&byte2))
240                {
241                    valid_chars += 1;
242                }
243                total_chars += 1;
244                i += 2;
245            } else {
246                total_chars += 1;
247                i += 1;
248            }
249        }
250
251        Ok(if total_chars > 0 {
252            valid_chars as f32 / total_chars as f32
253        } else {
254            0.0
255        })
256    }
257
258    fn check_iso88591_patterns(&self, data: &[u8]) -> Result<f32> {
259        let _ascii_count = data.iter().filter(|&&b| b < 0x80).count();
260        let extended_count = data.iter().filter(|&&b| b >= 0x80).count();
261        if extended_count > 0 {
262            let utf8_conf = self.check_utf8_validity(data)?;
263            Ok(if utf8_conf < 0.5 { 0.7 } else { 0.2 })
264        } else {
265            Ok(0.5)
266        }
267    }
268
269    fn check_windows1252_patterns(&self, data: &[u8]) -> Result<f32> {
270        let control_chars = data.iter().filter(|&&b| (0x80..=0x9F).contains(&b)).count();
271        let extended_chars = data.iter().filter(|&&b| b >= 0xA0).count();
272        if control_chars > 0 || extended_chars > 0 {
273            let utf8_conf = self.check_utf8_validity(data)?;
274            Ok(if utf8_conf < 0.5 { 0.6 } else { 0.1 })
275        } else {
276            Ok(0.3)
277        }
278    }
279
280    fn select_best_encoding(
281        &self,
282        candidates: Vec<EncodingCandidate>,
283        data: &[u8],
284    ) -> Result<EncodingInfo> {
285        if candidates.is_empty() {
286            return Ok(EncodingInfo {
287                charset: Charset::Unknown,
288                confidence: 0.0,
289                bom_detected: false,
290                sample_text: String::from("Unable to detect encoding"),
291            });
292        }
293        let best = &candidates[0];
294        if best.confidence < self.confidence_threshold {
295            let config = load_config()?;
296            return Ok(EncodingInfo {
297                charset: Charset::Utf8,
298                confidence: 0.5,
299                bom_detected: false,
300                sample_text: format!(
301                    "Using default encoding: {}",
302                    config.formats.default_encoding
303                ),
304            });
305        }
306        let sample = self.decode_sample(data, &best.charset)?;
307        Ok(EncodingInfo {
308            charset: best.charset.clone(),
309            confidence: best.confidence,
310            bom_detected: false,
311            sample_text: sample,
312        })
313    }
314
315    fn decode_sample(&self, data: &[u8], charset: &Charset) -> Result<String> {
316        let sample_size = data.len().min(200);
317        let sample_data = &data[0..sample_size];
318        match charset {
319            Charset::Utf8 => String::from_utf8(sample_data.to_vec())
320                .or_else(|_| Ok(String::from_utf8_lossy(sample_data).into_owned())),
321            _ => Ok(String::from_utf8_lossy(sample_data).into_owned()),
322        }
323    }
324
325    fn default_charsets() -> Vec<Charset> {
326        vec![
327            Charset::Utf8,
328            Charset::Gbk,
329            Charset::ShiftJis,
330            Charset::Big5,
331            Charset::Iso88591,
332            Charset::Windows1252,
333        ]
334    }
335}
336
337#[derive(Debug, Clone)]
338struct EncodingCandidate {
339    charset: Charset,
340    confidence: f32,
341}
342
343impl Default for EncodingDetector {
344    fn default() -> Self {
345        Self::new().unwrap_or(Self {
346            confidence_threshold: 0.7,
347            max_sample_size: 8192,
348            supported_charsets: Self::default_charsets(),
349        })
350    }
351}