subx_cli/core/formats/encoding/
converter.rs

1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use anyhow::anyhow;
4use encoding_rs::{BIG5, Encoding, GBK, ISO_8859_2, SHIFT_JIS, UTF_8, WINDOWS_1252};
5use std::collections::HashMap;
6
/// Result of an encoding conversion operation.
///
/// Contains the converted text along with metadata about the conversion
/// process, including error information and encoding details.
#[derive(Debug, Clone)]
pub struct ConversionResult {
    /// The decoded text. It is held in a `String`, so it is always UTF-8.
    pub converted_text: String,
    /// The charset the input bytes were interpreted as.
    pub original_encoding: Charset,
    /// The charset of the output text.
    pub target_encoding: Charset,
    /// Length in bytes of the input buffer handed to the converter.
    pub bytes_processed: usize,
    /// Whether malformed input was reported during conversion.
    pub had_errors: bool,
    /// Number of U+FFFD replacement characters counted in the converted text
    /// when errors were reported; `0` otherwise.
    pub error_count: usize,
}
26
/// Encoding converter.
///
/// Decodes byte data in a known source charset into UTF-8 text.
pub struct EncodingConverter {
    // Lookup table from a project `Charset` to the `encoding_rs` encoding
    // used to decode it.
    encoding_map: HashMap<Charset, &'static Encoding>,
}
31
32impl EncodingConverter {
33    /// Create converter and initialize encoding mapping
34    pub fn new() -> Self {
35        let mut encoding_map = HashMap::new();
36        encoding_map.insert(Charset::Utf8, UTF_8);
37        encoding_map.insert(Charset::Gbk, GBK);
38        encoding_map.insert(Charset::ShiftJis, SHIFT_JIS);
39        encoding_map.insert(Charset::Big5, BIG5);
40        encoding_map.insert(Charset::Windows1252, WINDOWS_1252);
41        encoding_map.insert(Charset::Iso88591, ISO_8859_2);
42        Self { encoding_map }
43    }
44
45    /// Convert data to UTF-8
46    pub fn convert_to_utf8(
47        &self,
48        data: &[u8],
49        source_encoding: &Charset,
50    ) -> Result<ConversionResult> {
51        if *source_encoding == Charset::Utf8 {
52            return Ok(ConversionResult {
53                converted_text: String::from_utf8_lossy(data).to_string(),
54                original_encoding: Charset::Utf8,
55                target_encoding: Charset::Utf8,
56                bytes_processed: data.len(),
57                had_errors: false,
58                error_count: 0,
59            });
60        }
61        let encoding = self
62            .encoding_map
63            .get(source_encoding)
64            .ok_or_else(|| anyhow!("Unsupported encoding: {:?}", source_encoding))?;
65        let (converted, _, had_errors) = encoding.decode(data);
66        let error_count = if had_errors {
67            self.count_replacement_chars(&converted)
68        } else {
69            0
70        };
71        Ok(ConversionResult {
72            converted_text: converted.into_owned(),
73            original_encoding: source_encoding.clone(),
74            target_encoding: Charset::Utf8,
75            bytes_processed: data.len(),
76            had_errors,
77            error_count,
78        })
79    }
80
81    /// Convert file content to UTF-8
82    pub fn convert_file_to_utf8(
83        &self,
84        file_path: &str,
85        encoding_info: &EncodingInfo,
86    ) -> Result<ConversionResult> {
87        let data = std::fs::read(file_path)?;
88        let slice = if encoding_info.bom_detected {
89            self.skip_bom(&data, &encoding_info.charset)
90        } else {
91            data.as_slice()
92        };
93        self.convert_to_utf8(slice, &encoding_info.charset)
94    }
95
96    fn skip_bom<'a>(&self, data: &'a [u8], charset: &Charset) -> &'a [u8] {
97        match charset {
98            Charset::Utf8 if data.starts_with(&[0xEF, 0xBB, 0xBF]) => &data[3..],
99            Charset::Utf16Le if data.starts_with(&[0xFF, 0xFE]) => &data[2..],
100            Charset::Utf16Be if data.starts_with(&[0xFE, 0xFF]) => &data[2..],
101            Charset::Utf32Le if data.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) => &data[4..],
102            Charset::Utf32Be if data.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) => &data[4..],
103            _ => data,
104        }
105    }
106
107    fn count_replacement_chars(&self, text: &str) -> usize {
108        text.chars().filter(|&c| c == '\u{FFFD}').count()
109    }
110
111    /// Validate conversion result
112    pub fn validate_conversion(&self, result: &ConversionResult) -> ValidationResult {
113        ValidationResult {
114            is_valid: !result.had_errors || result.error_count == 0,
115            confidence: if result.had_errors {
116                1.0 - result.error_count as f32 / result.converted_text.len() as f32
117            } else {
118                1.0
119            },
120            warnings: self.generate_warnings(result),
121        }
122    }
123
124    fn generate_warnings(&self, result: &ConversionResult) -> Vec<String> {
125        let mut warnings = Vec::new();
126        if result.had_errors {
127            warnings.push(format!(
128                "Encoding conversion had {} replacement characters",
129                result.error_count
130            ));
131        }
132        if result.error_count > result.bytes_processed / 10 {
133            warnings.push("High error rate detected - encoding may be incorrect".to_string());
134        }
135        warnings
136    }
137}
138
/// Result of encoding validation process.
///
/// Contains validation status, confidence level, and any warnings
/// about potential encoding issues.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// Whether the conversion is considered clean: no errors were reported,
    /// or no replacement characters were counted.
    pub is_valid: bool,
    /// Confidence in the conversion: `1.0` when no errors were reported,
    /// lower as the share of replacement characters grows.
    pub confidence: f32,
    /// Human-readable warnings describing detected problems; empty when
    /// nothing was flagged.
    pub warnings: Vec<String>,
}
152
153impl Default for EncodingConverter {
154    fn default() -> Self {
155        Self::new()
156    }
157}