1use crate::Result;
2use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
3use anyhow::anyhow;
4use encoding_rs::{BIG5, Encoding, GBK, ISO_8859_2, SHIFT_JIS, UTF_8, WINDOWS_1252};
5use std::collections::HashMap;
6
/// Outcome of converting a byte buffer to UTF-8 text.
#[derive(Debug, Clone)]
pub struct ConversionResult {
    /// The decoded text.
    pub converted_text: String,
    /// Encoding the input bytes were interpreted as.
    pub original_encoding: Charset,
    /// Encoding of `converted_text`; the converter in this file always sets `Charset::Utf8`.
    pub target_encoding: Charset,
    /// Length in bytes of the input that was handed to the decoder.
    pub bytes_processed: usize,
    /// `true` when the conversion flagged malformed input.
    pub had_errors: bool,
    /// Number of U+FFFD replacement characters counted in `converted_text` when
    /// errors were flagged; `0` otherwise.
    pub error_count: usize,
}
26
/// Converts text in a supported legacy encoding to UTF-8.
pub struct EncodingConverter {
    /// Lookup table from a detected charset to the `encoding_rs` decoder used for it.
    encoding_map: HashMap<Charset, &'static Encoding>,
}
31
32impl EncodingConverter {
33 pub fn new() -> Self {
35 let mut encoding_map = HashMap::new();
36 encoding_map.insert(Charset::Utf8, UTF_8);
37 encoding_map.insert(Charset::Gbk, GBK);
38 encoding_map.insert(Charset::ShiftJis, SHIFT_JIS);
39 encoding_map.insert(Charset::Big5, BIG5);
40 encoding_map.insert(Charset::Windows1252, WINDOWS_1252);
41 encoding_map.insert(Charset::Iso88591, ISO_8859_2);
42 Self { encoding_map }
43 }
44
45 pub fn convert_to_utf8(
47 &self,
48 data: &[u8],
49 source_encoding: &Charset,
50 ) -> Result<ConversionResult> {
51 if *source_encoding == Charset::Utf8 {
52 return Ok(ConversionResult {
53 converted_text: String::from_utf8_lossy(data).to_string(),
54 original_encoding: Charset::Utf8,
55 target_encoding: Charset::Utf8,
56 bytes_processed: data.len(),
57 had_errors: false,
58 error_count: 0,
59 });
60 }
61 let encoding = self
62 .encoding_map
63 .get(source_encoding)
64 .ok_or_else(|| anyhow!("Unsupported encoding: {:?}", source_encoding))?;
65 let (converted, _, had_errors) = encoding.decode(data);
66 let error_count = if had_errors {
67 self.count_replacement_chars(&converted)
68 } else {
69 0
70 };
71 Ok(ConversionResult {
72 converted_text: converted.into_owned(),
73 original_encoding: source_encoding.clone(),
74 target_encoding: Charset::Utf8,
75 bytes_processed: data.len(),
76 had_errors,
77 error_count,
78 })
79 }
80
81 pub fn convert_file_to_utf8(
83 &self,
84 file_path: &str,
85 encoding_info: &EncodingInfo,
86 ) -> Result<ConversionResult> {
87 crate::core::fs_util::check_file_size(
88 std::path::Path::new(file_path),
89 52_428_800,
90 "Subtitle",
91 )?;
92 let data = std::fs::read(file_path)?;
93 let slice = if encoding_info.bom_detected {
94 self.skip_bom(&data, &encoding_info.charset)
95 } else {
96 data.as_slice()
97 };
98 self.convert_to_utf8(slice, &encoding_info.charset)
99 }
100
101 fn skip_bom<'a>(&self, data: &'a [u8], charset: &Charset) -> &'a [u8] {
102 match charset {
103 Charset::Utf8 if data.starts_with(&[0xEF, 0xBB, 0xBF]) => &data[3..],
104 Charset::Utf16Le if data.starts_with(&[0xFF, 0xFE]) => &data[2..],
105 Charset::Utf16Be if data.starts_with(&[0xFE, 0xFF]) => &data[2..],
106 Charset::Utf32Le if data.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) => &data[4..],
107 Charset::Utf32Be if data.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) => &data[4..],
108 _ => data,
109 }
110 }
111
112 fn count_replacement_chars(&self, text: &str) -> usize {
113 text.chars().filter(|&c| c == '\u{FFFD}').count()
114 }
115
116 pub fn validate_conversion(&self, result: &ConversionResult) -> ValidationResult {
118 ValidationResult {
119 is_valid: !result.had_errors || result.error_count == 0,
120 confidence: if result.had_errors {
121 1.0 - result.error_count as f32 / result.converted_text.len() as f32
122 } else {
123 1.0
124 },
125 warnings: self.generate_warnings(result),
126 }
127 }
128
129 fn generate_warnings(&self, result: &ConversionResult) -> Vec<String> {
130 let mut warnings = Vec::new();
131 if result.had_errors {
132 warnings.push(format!(
133 "Encoding conversion had {} replacement characters",
134 result.error_count
135 ));
136 }
137 if result.error_count > result.bytes_processed / 10 {
138 warnings.push("High error rate detected - encoding may be incorrect".to_string());
139 }
140 warnings
141 }
142}
143
/// Assessment of a [`ConversionResult`] produced by `EncodingConverter::validate_conversion`.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// `true` when the conversion flagged no errors or counted no replacement characters.
    pub is_valid: bool,
    /// Confidence score; `1.0` when the conversion flagged no errors, lower as
    /// the share of replacement characters grows.
    pub confidence: f32,
    /// Human-readable warnings about the conversion; empty when nothing was flagged.
    pub warnings: Vec<String>,
}
157
158impl Default for EncodingConverter {
159 fn default() -> Self {
160 Self::new()
161 }
162}
163
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::formats::encoding::charset::{Charset, EncodingInfo};
    use std::fs;
    use tempfile::TempDir;

    /// GBK byte sequence used by several tests; it decodes to text containing '你'.
    const GBK_SAMPLE: [u8; 4] = [0xC4, 0xE3, 0xBA, 0xC3];

    /// Returns a freshly constructed converter.
    fn make_converter() -> EncodingConverter {
        EncodingConverter::new()
    }

    /// Builds an `EncodingInfo` with full confidence and no sample text.
    fn make_encoding_info(charset: Charset, bom_detected: bool) -> EncodingInfo {
        EncodingInfo {
            charset,
            confidence: 1.0,
            bom_detected,
            sample_text: String::new(),
        }
    }

    /// Builds a `ConversionResult` targeting UTF-8 for the validation tests.
    fn make_result(
        text: &str,
        original_encoding: Charset,
        bytes_processed: usize,
        had_errors: bool,
        error_count: usize,
    ) -> ConversionResult {
        ConversionResult {
            converted_text: text.to_string(),
            original_encoding,
            target_encoding: Charset::Utf8,
            bytes_processed,
            had_errors,
            error_count,
        }
    }

    /// Writes `contents` to `file_name` inside a new temporary directory.
    ///
    /// The returned `TempDir` must be kept alive for as long as the path is
    /// used; dropping it deletes the directory.
    fn write_temp_file(file_name: &str, contents: &[u8]) -> (TempDir, String) {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join(file_name);
        fs::write(&path, contents).unwrap();
        let path = path.to_str().unwrap().to_owned();
        (dir, path)
    }

    /// Asserts that converting in-memory bytes from `charset` is rejected.
    fn assert_charset_rejected(charset: Charset) {
        let result = make_converter().convert_to_utf8(b"data", &charset);
        assert!(result.is_err());
    }

    /// Writes `bom` followed by `payload` to a file, then asserts that file
    /// conversion with the BOM flag set fails for `charset`.
    fn assert_bom_file_rejected(file_name: &str, bom: &[u8], payload: &[u8], charset: Charset) {
        let mut data = bom.to_vec();
        data.extend_from_slice(payload);
        let (_dir, path) = write_temp_file(file_name, &data);
        let info = make_encoding_info(charset, true);
        let result = make_converter().convert_file_to_utf8(&path, &info);
        assert!(result.is_err());
    }

    // ---- convert_to_utf8: UTF-8 pass-through ----

    #[test]
    fn test_convert_to_utf8_utf8_passthrough_ascii() {
        let text = "Hello, World!";
        let result = make_converter()
            .convert_to_utf8(text.as_bytes(), &Charset::Utf8)
            .unwrap();
        assert_eq!(result.converted_text, text);
        assert_eq!(result.original_encoding, Charset::Utf8);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, text.len());
        assert!(!result.had_errors);
        assert_eq!(result.error_count, 0);
    }

    #[test]
    fn test_convert_to_utf8_utf8_passthrough_multibyte() {
        let text = "測試文字 🌍";
        let result = make_converter()
            .convert_to_utf8(text.as_bytes(), &Charset::Utf8)
            .unwrap();
        assert_eq!(result.converted_text, text);
        assert_eq!(result.original_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, text.as_bytes().len());
        assert!(!result.had_errors);
    }

    #[test]
    fn test_convert_to_utf8_utf8_empty_bytes() {
        let result = make_converter()
            .convert_to_utf8(&[], &Charset::Utf8)
            .unwrap();
        assert_eq!(result.converted_text, "");
        assert_eq!(result.bytes_processed, 0);
        assert!(!result.had_errors);
        assert_eq!(result.error_count, 0);
    }

    // ---- convert_to_utf8: supported legacy encodings ----

    #[test]
    fn test_convert_to_utf8_gbk() {
        let result = make_converter()
            .convert_to_utf8(&GBK_SAMPLE, &Charset::Gbk)
            .unwrap();
        assert_eq!(result.original_encoding, Charset::Gbk);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, GBK_SAMPLE.len());
        assert!(!result.had_errors);
        assert!(result.converted_text.contains('你'));
    }

    #[test]
    fn test_convert_to_utf8_gbk_empty() {
        let result = make_converter()
            .convert_to_utf8(&[], &Charset::Gbk)
            .unwrap();
        assert_eq!(result.converted_text, "");
        assert_eq!(result.bytes_processed, 0);
        assert!(!result.had_errors);
        assert_eq!(result.error_count, 0);
    }

    #[test]
    fn test_convert_to_utf8_windows1252() {
        // 0xE9 is 'é' (U+00E9) in Windows-1252.
        let bytes = [b'c', b'a', b'f', 0xE9u8];
        let result = make_converter()
            .convert_to_utf8(&bytes, &Charset::Windows1252)
            .unwrap();
        assert_eq!(result.original_encoding, Charset::Windows1252);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, bytes.len());
        // Exact comparison replaces the former duplicated `contains('é') || contains('é')`.
        assert_eq!(result.converted_text, "caf\u{E9}");
        assert!(!result.had_errors);
    }

    #[test]
    fn test_convert_to_utf8_shiftjis() {
        let shiftjis_bytes = [0x83u8, 0x65, 0x83, 0x58, 0x83, 0x67];
        let result = make_converter()
            .convert_to_utf8(&shiftjis_bytes, &Charset::ShiftJis)
            .unwrap();
        assert_eq!(result.original_encoding, Charset::ShiftJis);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, shiftjis_bytes.len());
        assert!(!result.converted_text.is_empty());
    }

    #[test]
    fn test_convert_to_utf8_big5() {
        let big5_bytes = [0xA7u8, 0x41, 0xA6, 0x6E];
        let result = make_converter()
            .convert_to_utf8(&big5_bytes, &Charset::Big5)
            .unwrap();
        assert_eq!(result.original_encoding, Charset::Big5);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, big5_bytes.len());
        assert!(!result.converted_text.is_empty());
    }

    #[test]
    fn test_convert_to_utf8_iso88591() {
        let bytes = [b'H', b'e', b'l', b'l', b'o', 0xE0u8];
        let result = make_converter()
            .convert_to_utf8(&bytes, &Charset::Iso88591)
            .unwrap();
        assert_eq!(result.original_encoding, Charset::Iso88591);
        assert_eq!(result.target_encoding, Charset::Utf8);
        assert_eq!(result.bytes_processed, bytes.len());
        assert!(!result.converted_text.is_empty());
    }

    // ---- convert_to_utf8: unsupported encodings ----

    #[test]
    fn test_convert_to_utf8_unknown_returns_error() {
        let result = make_converter().convert_to_utf8(b"some data", &Charset::Unknown);
        assert!(result.is_err());
        assert!(
            result
                .unwrap_err()
                .to_string()
                .contains("Unsupported encoding")
        );
    }

    #[test]
    fn test_convert_to_utf8_utf16le_returns_error() {
        assert_charset_rejected(Charset::Utf16Le);
    }

    #[test]
    fn test_convert_to_utf8_utf16be_returns_error() {
        assert_charset_rejected(Charset::Utf16Be);
    }

    #[test]
    fn test_convert_to_utf8_utf32le_returns_error() {
        assert_charset_rejected(Charset::Utf32Le);
    }

    #[test]
    fn test_convert_to_utf8_utf32be_returns_error() {
        assert_charset_rejected(Charset::Utf32Be);
    }

    #[test]
    fn test_convert_to_utf8_euckr_returns_error() {
        let result = make_converter().convert_to_utf8(b"data", &Charset::Euckr);
        assert!(result.is_err());
        assert!(
            result
                .unwrap_err()
                .to_string()
                .contains("Unsupported encoding")
        );
    }

    // ---- convert_to_utf8: malformed input ----

    #[test]
    fn test_convert_to_utf8_invalid_gbk_triggers_replacement_chars() {
        // 0x81 is a GBK lead byte and 0x20 is not a valid trail byte, so each
        // pair is malformed and must be reported.
        let invalid_gbk = [0x81u8, 0x20, 0x81, 0x20];
        let result = make_converter()
            .convert_to_utf8(&invalid_gbk, &Charset::Gbk)
            .unwrap();
        // Asserted unconditionally: guarding on `had_errors` would let the
        // test pass without checking anything.
        assert!(result.had_errors);
        assert!(result.error_count > 0);
        assert!(result.converted_text.contains('\u{FFFD}'));
    }

    // ---- convert_file_to_utf8 ----

    #[test]
    fn test_convert_file_to_utf8_utf8_no_bom() {
        let content = "Hello, 世界!";
        let (_dir, path) = write_temp_file("test.txt", content.as_bytes());
        let info = make_encoding_info(Charset::Utf8, false);
        let result = make_converter()
            .convert_file_to_utf8(&path, &info)
            .unwrap();
        assert_eq!(result.converted_text, content);
        assert!(!result.had_errors);
    }

    #[test]
    fn test_convert_file_to_utf8_nonexistent_file() {
        let info = make_encoding_info(Charset::Utf8, false);
        let result =
            make_converter().convert_file_to_utf8("/nonexistent/path/does_not_exist.txt", &info);
        assert!(result.is_err());
    }

    #[test]
    fn test_convert_file_to_utf8_gbk_no_bom() {
        let (_dir, path) = write_temp_file("gbk.txt", &GBK_SAMPLE);
        let info = make_encoding_info(Charset::Gbk, false);
        let result = make_converter()
            .convert_file_to_utf8(&path, &info)
            .unwrap();
        assert_eq!(result.original_encoding, Charset::Gbk);
        assert!(result.converted_text.contains('你'));
    }

    // ---- BOM handling ----

    #[test]
    fn test_convert_file_to_utf8_utf8_with_bom_stripped() {
        let content = "Hello, World!";
        let mut data = vec![0xEFu8, 0xBB, 0xBF];
        data.extend_from_slice(content.as_bytes());
        let (_dir, path) = write_temp_file("bom_utf8.txt", &data);
        let info = make_encoding_info(Charset::Utf8, true);
        let result = make_converter()
            .convert_file_to_utf8(&path, &info)
            .unwrap();
        // The BOM must not leak into the decoded text.
        assert_eq!(result.converted_text, content);
        assert!(!result.had_errors);
    }

    #[test]
    fn test_skip_bom_utf16le_exercised_then_fails() {
        // The BOM branch runs, then decoding fails because UTF-16 has no decoder registered.
        assert_bom_file_rejected(
            "utf16le.bin",
            &[0xFF, 0xFE],
            b"H\x00i\x00",
            Charset::Utf16Le,
        );
    }

    #[test]
    fn test_skip_bom_utf16be_exercised_then_fails() {
        assert_bom_file_rejected(
            "utf16be.bin",
            &[0xFE, 0xFF],
            b"\x00H\x00i",
            Charset::Utf16Be,
        );
    }

    #[test]
    fn test_skip_bom_utf32le_exercised_then_fails() {
        assert_bom_file_rejected(
            "utf32le.bin",
            &[0xFF, 0xFE, 0x00, 0x00],
            b"H\x00\x00\x00",
            Charset::Utf32Le,
        );
    }

    #[test]
    fn test_skip_bom_utf32be_exercised_then_fails() {
        assert_bom_file_rejected(
            "utf32be.bin",
            &[0x00, 0x00, 0xFE, 0xFF],
            b"\x00\x00\x00H",
            Charset::Utf32Be,
        );
    }

    #[test]
    fn test_skip_bom_fallthrough_mismatched_bom_flag() {
        // The BOM flag is set for a charset that has no BOM: data must pass through untouched.
        let (_dir, path) = write_temp_file("gbk_no_bom.txt", &GBK_SAMPLE);
        let info = make_encoding_info(Charset::Gbk, true);
        let result = make_converter()
            .convert_file_to_utf8(&path, &info)
            .unwrap();
        assert!(result.converted_text.contains('你'));
    }

    #[test]
    fn test_skip_bom_utf8_charset_but_no_bom_bytes() {
        // The BOM flag is set but the file does not start with a BOM: nothing may be cut off.
        let content = "Just text";
        let (_dir, path) = write_temp_file("utf8_nobom.txt", content.as_bytes());
        let info = make_encoding_info(Charset::Utf8, true);
        let result = make_converter()
            .convert_file_to_utf8(&path, &info)
            .unwrap();
        assert_eq!(result.converted_text, content);
    }

    // ---- validate_conversion ----

    #[test]
    fn test_validate_conversion_no_errors() {
        let result = make_result("Hello World", Charset::Utf8, 11, false, 0);
        let validation = make_converter().validate_conversion(&result);
        assert!(validation.is_valid);
        assert_eq!(validation.confidence, 1.0);
        assert!(validation.warnings.is_empty());
    }

    #[test]
    fn test_validate_conversion_had_errors_zero_count_still_valid() {
        // Errors flagged but no replacement characters counted: still valid.
        let result = make_result("Hello World", Charset::Gbk, 11, true, 0);
        let validation = make_converter().validate_conversion(&result);
        assert!(validation.is_valid);
        assert_eq!(validation.confidence, 1.0);
        // The flag alone still produces the replacement-character warning.
        assert_eq!(validation.warnings.len(), 1);
        assert!(validation.warnings[0].contains("replacement characters"));
    }

    #[test]
    fn test_validate_conversion_with_replacement_errors() {
        let result = make_result("Hello\u{FFFD}World", Charset::Windows1252, 11, true, 1);
        let validation = make_converter().validate_conversion(&result);
        assert!(!validation.is_valid);
        assert!(validation.confidence < 1.0);
        assert!(!validation.warnings.is_empty());
        assert!(validation.warnings[0].contains("replacement characters"));
    }

    #[test]
    fn test_validate_conversion_high_error_rate_warning() {
        // 3 replacements against 10 processed bytes exceeds the one-tenth threshold.
        let result = make_result(
            "\u{FFFD}\u{FFFD}\u{FFFD}AB",
            Charset::ShiftJis,
            10,
            true,
            3,
        );
        let validation = make_converter().validate_conversion(&result);
        assert!(!validation.is_valid);
        assert!(validation.warnings.len() >= 2);
        assert!(
            validation
                .warnings
                .iter()
                .any(|w| w.contains("High error rate"))
        );
    }

    // ---- Default ----

    #[test]
    fn test_encoding_converter_default_works() {
        let converter = EncodingConverter::default();
        let result = converter.convert_to_utf8(b"hello", &Charset::Utf8).unwrap();
        assert_eq!(result.converted_text, "hello");
    }
}