1#[macro_use]
2extern crate lazy_static;
3
4mod badness;
5mod chardata;
6mod fixes;
7
8mod codecs;
9
10use std::borrow::Cow;
11use std::cmp::min;
12
13use badness::is_bad;
14use chardata::possible_encoding;
15use chardata::ALTERED_UTF8_RE;
16use chardata::CHARMAP_ENCODINGS;
17use codecs::sloppy;
18use codecs::sloppy::Codec;
19use codecs::sloppy::LATIN_1;
20use codecs::sloppy::WINDOWS_1252;
21use codecs::utf8_variants;
22use fixes::decode_inconsistent_utf8;
23use fixes::fix_c1_controls;
24use fixes::fix_character_width;
25use fixes::fix_latin_ligatures;
26use fixes::fix_line_breaks;
27use fixes::remove_control_chars;
28use fixes::remove_terminal_escapes;
29use fixes::replace_lossy_sequences;
30use fixes::restore_byte_a0;
31use fixes::uncurl_quotes;
32use fixes::unescape_html;
33use icu::normalizer::ComposingNormalizer;
34use icu::normalizer::DecomposingNormalizer;
35
36use crate::codecs::sloppy::CodecType;
37
/// Unicode normalization form that [`fix_and_explain`] applies as its last
/// step on every pass.
///
/// `PartialEq`/`Eq` are derived so that callers can compare configured forms
/// directly (`config.normalization == Some(Normalization::NFC)`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Normalization {
    /// Normalize with `ComposingNormalizer::new_nfc()`.
    NFC,
    /// Normalize with `ComposingNormalizer::new_nfkc()`.
    NFKC,
    /// Normalize with `DecomposingNormalizer::new_nfd()`.
    NFD,
    /// Normalize with `DecomposingNormalizer::new_nfkd()`.
    NFKD,
}
45
/// Upper bound on the number of passes the fixing loops in this file run
/// while waiting for the text to stop changing.
const MAX_ATTEMPTS: i32 = 16;
47
48#[derive(Debug, Clone, Copy)]
168pub struct TextFixerConfig {
169 pub unescape_html: Option<bool>,
170 pub remove_terminal_escapes: bool,
171 pub fix_encoding: bool,
172 pub restore_byte_a0: bool,
173 pub replace_lossy_sequences: bool,
174 pub decode_inconsistent_utf8: bool,
175 pub fix_c1_controls: bool,
176 pub fix_latin_ligatures: bool,
177 pub fix_character_width: bool,
178 pub uncurl_quotes: bool,
179 pub fix_line_breaks: bool,
180 pub remove_control_chars: bool,
181 pub normalization: Option<Normalization>,
182 pub max_decode_length: i32,
183}
184
185impl Default for TextFixerConfig {
186 fn default() -> Self {
187 Self {
188 unescape_html: None,
189 remove_terminal_escapes: true,
190 fix_encoding: true,
191 restore_byte_a0: true,
192 replace_lossy_sequences: true,
193 decode_inconsistent_utf8: true,
194 fix_c1_controls: true,
195 fix_latin_ligatures: true,
196 fix_character_width: true,
197 uncurl_quotes: true,
198 fix_line_breaks: true,
199 remove_control_chars: true,
200 normalization: Some(Normalization::NFC),
201 max_decode_length: 1_000_000,
202 }
203 }
204}
205
206pub fn fix_text(text: &str, config: Option<&TextFixerConfig>) -> String {
207 let mut config: TextFixerConfig = match config {
241 Some(config) => config.clone(),
242 None => TextFixerConfig::default(),
243 };
244
245 let mut out: Vec<String> = Vec::new();
246
247 let mut pos = 0;
248
249 while pos < text.len() {
250 let mut textbreak = match text[pos..].find("\n") {
251 Some(idx) => pos + idx + 1,
252 None => text.len(),
253 };
254
255 if (textbreak - pos) > config.max_decode_length as usize {
256 textbreak = min(pos + config.max_decode_length as usize, text.len());
257 }
258
259 let segment = &text[pos..textbreak];
260
261 if config.unescape_html.is_none() {
262 if segment.contains("<") {
263 config.unescape_html = Some(false);
264 }
265 }
266
267 let res = fix_and_explain(segment, false, Some(&config));
268 out.push(res.text);
269
270 pos = textbreak;
271 }
272
273 out.join("")
274}
275
/// One entry in the log of transformations applied while fixing a text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExplanationStep {
    /// Name of the transformation, e.g. `"fix_c1_controls"` or
    /// `"encode latin-1"`.
    pub transformation: String,
}

/// A fixed text together with the optional log of how it was obtained.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExplainedText {
    /// The text after fixing.
    pub text: String,
    /// The transformations that were applied, or `None` when no explanation
    /// was collected.
    pub steps: Option<Vec<ExplanationStep>>,
}
297
298fn apply_step<'a, F>(
299 f: F,
300 text: &'a str,
301 step: ExplanationStep,
302 steps: &mut Option<Vec<ExplanationStep>>,
303) -> Cow<'a, str>
304where
305 F: Fn(&'a str) -> Cow<'a, str>,
306{
307 let res = f(text);
308 if res != text {
309 if let Some(s) = steps {
310 s.push(step);
311 }
312 }
313 res
314}
315
316pub fn fix_and_explain(
317 text: &str,
318 explain: bool,
319 config: Option<&TextFixerConfig>,
320) -> ExplainedText {
321 let mut text = text.to_string();
329 let config = match config {
330 Some(config) => config.clone(),
331 None => TextFixerConfig::default(),
332 };
333
334 let mut steps: Option<Vec<ExplanationStep>> = if explain { Some(Vec::new()) } else { None };
335
336 for _ in 0..MAX_ATTEMPTS {
337 let temp = unescape_html(&text);
338
339 let temp = if config.fix_encoding {
340 let encoding_fixed = fix_encoding_and_explain(&temp, explain, Some(&config));
341 if let Some(s) = &mut steps {
342 s.extend(encoding_fixed.steps.unwrap_or(Vec::new()));
343 }
344 encoding_fixed.text.into()
345 } else {
346 temp
347 };
348
349 let temp = if config.fix_c1_controls {
350 apply_step(
351 fix_c1_controls,
352 &temp,
353 ExplanationStep {
354 transformation: String::from("fix_c1_controls"),
355 },
356 &mut steps,
357 )
358 } else {
359 temp
360 };
361
362 let temp = if config.fix_latin_ligatures {
363 apply_step(
364 fix_latin_ligatures,
365 &temp,
366 ExplanationStep {
367 transformation: String::from("fix_latin_ligatures"),
368 },
369 &mut steps,
370 )
371 } else {
372 temp
373 };
374
375 let temp = if config.fix_character_width {
376 apply_step(
377 fix_character_width,
378 &temp,
379 ExplanationStep {
380 transformation: String::from("fix_character_width"),
381 },
382 &mut steps,
383 )
384 } else {
385 temp
386 };
387
388 let temp = if config.uncurl_quotes {
389 apply_step(
390 uncurl_quotes,
391 &temp,
392 ExplanationStep {
393 transformation: String::from("uncurl_quotes"),
394 },
395 &mut steps,
396 )
397 } else {
398 temp
399 };
400
401 let temp = if config.fix_line_breaks {
402 apply_step(
403 fix_line_breaks,
404 &temp,
405 ExplanationStep {
406 transformation: String::from("fix_line_breaks"),
407 },
408 &mut steps,
409 )
410 } else {
411 temp
412 };
413
414 let temp = if config.remove_terminal_escapes {
415 apply_step(
416 remove_terminal_escapes,
417 &temp,
418 ExplanationStep {
419 transformation: String::from("remove_terminal_escapes"),
420 },
421 &mut steps,
422 )
423 } else {
424 temp
425 };
426
427 let temp = if config.remove_control_chars {
428 apply_step(
429 remove_control_chars,
430 &temp,
431 ExplanationStep {
432 transformation: String::from("remove_control_chars"),
433 },
434 &mut steps,
435 )
436 } else {
437 temp
438 };
439
440 let temp = if let Some(normalization) = &config.normalization {
441 apply_step(
442 |t| match normalization {
443 Normalization::NFC => ComposingNormalizer::new_nfc().normalize(t).into(),
444 Normalization::NFD => DecomposingNormalizer::new_nfd().normalize(t).into(),
445 Normalization::NFKD => DecomposingNormalizer::new_nfkd().normalize(t).into(),
446 Normalization::NFKC => ComposingNormalizer::new_nfkc().normalize(t).into(),
447 },
448 &temp,
449 ExplanationStep {
450 transformation: String::from("normalize"),
451 },
452 &mut steps,
453 )
454 } else {
455 temp
456 };
457
458 if temp == text {
459 return ExplainedText {
460 text: text.into(),
461 steps,
462 };
463 }
464
465 text = temp.into();
466 }
467
468 ExplainedText { text, steps }
469}
470
471fn fix_encoding_and_explain(
472 text: &str,
473 explain: bool,
474 config: Option<&TextFixerConfig>,
475) -> ExplainedText {
476 let config = match config {
485 Some(config) => config.clone(),
486 None => TextFixerConfig::default(),
487 };
488
489 let mut prev_text = text.to_string();
490
491 let plan_so_far = if explain { Some(Vec::new()) } else { None };
492
493 for _ in 0..MAX_ATTEMPTS {
494 let new_text = _fix_encoding_one_step_and_explain(&prev_text, explain, &config);
495
496 if new_text.text == prev_text {
497 if let Some(mut plan) = plan_so_far {
498 plan.extend(new_text.steps.unwrap_or(Vec::new()));
499
500 return ExplainedText {
501 text: new_text.text,
502 steps: Some(plan),
503 };
504 }
505
506 return ExplainedText {
507 text: new_text.text,
508 steps: None,
509 };
510 }
511
512 prev_text = new_text.text;
513 }
514
515 ExplainedText {
516 text: prev_text,
517 steps: None,
518 }
519}
520
521fn _fix_encoding_one_step_and_explain(
522 text: &str,
523 explain: bool,
524 config: &TextFixerConfig,
525) -> ExplainedText {
526 let mut text = text.to_string();
527
528 if text.len() == 0 {
529 return ExplainedText { text, steps: None };
531 }
532
533 if possible_encoding(&text, sloppy::CodecType::Ascii) || !is_bad(&text) {
536 return ExplainedText { text, steps: None };
537 }
538
539 let mut possible_1byte_encodings = vec![];
543
544 for (codec_type, encoding) in CHARMAP_ENCODINGS.iter() {
548 if possible_encoding(&text, encoding.codec_type()) {
549 possible_1byte_encodings.push(codec_type);
550 let encoded_bytes = encoding.encode(&text);
551
552 if let Ok(mut encoded_bytes) = encoded_bytes {
555 let mut decoding = CodecType::Utf8;
556 let mut transcode_steps = if explain { Some(Vec::new()) } else { None };
557
558 if config.restore_byte_a0 && ALTERED_UTF8_RE.is_match(&encoded_bytes) {
561 let replaced_bytes = restore_byte_a0(&encoded_bytes);
562
563 if replaced_bytes != encoded_bytes {
564 if let Some(s) = &mut transcode_steps {
565 s.push(ExplanationStep {
566 transformation: String::from("restore_byte_a0"),
567 });
568 }
569 encoded_bytes = replaced_bytes;
570 }
571 }
572
573 if config.replace_lossy_sequences && encoding.name().starts_with("sloppy") {
575 let replaced_bytes = replace_lossy_sequences(&encoded_bytes);
576
577 if replaced_bytes != encoded_bytes {
578 if let Some(s) = &mut transcode_steps {
579 s.push(ExplanationStep {
580 transformation: String::from("replace_lossy_sequences"),
581 });
582 }
583 encoded_bytes = replaced_bytes;
584 }
585 }
586
587 if encoded_bytes.contains(&0xED) || encoded_bytes.contains(&0xC0) {
588 decoding = CodecType::Utf8Variant;
589 }
590
591 let steps = if explain {
592 Some(vec![
593 ExplanationStep {
594 transformation: format!("decode {:?}", decoding),
595 },
596 ExplanationStep {
597 transformation: format!("encode {}", encoding.name()),
598 },
599 ])
600 } else {
601 None
602 };
603
604 if decoding == CodecType::Utf8 {
605 let fixed = std::str::from_utf8(&encoded_bytes);
606
607 if let Ok(s) = fixed {
608 return ExplainedText {
609 text: s.to_string(),
610 steps,
611 };
612 } else {
613 continue;
614 }
615 } else if decoding == CodecType::Utf8Variant {
616 let fixed = utf8_variants::variant_decode(&encoded_bytes);
617
618 if let Ok(s) = fixed {
619 return ExplainedText {
620 text: s.to_string(),
621 steps,
622 };
623 } else {
624 continue;
625 }
626 }
627 }
628 }
629 }
630
631 if config.decode_inconsistent_utf8 {
633 let fixed = decode_inconsistent_utf8(&text);
634 if fixed != text {
635 text = fixed.into();
636 }
637 }
638
639 if possible_1byte_encodings.contains(&&sloppy::CodecType::Latin1) {
643 if possible_1byte_encodings.contains(&&sloppy::CodecType::SloppyWindows1252) {
644 return ExplainedText { text, steps: None };
647 } else {
648 let encoded = LATIN_1.encode(&text);
652 if let Ok(encoded) = encoded {
653 let fixed = WINDOWS_1252.decode(&encoded);
654 if fixed != text {
655 let steps = if explain {
656 Some(vec![
657 ExplanationStep {
658 transformation: String::from("encode latin-1"),
659 },
660 ExplanationStep {
661 transformation: String::from("decode windows-1252"),
662 },
663 ])
664 } else {
665 None
666 };
667
668 return ExplainedText { text: fixed, steps };
669 }
670 }
671 }
672 }
673
674 if config.fix_c1_controls {
676 let fixed = fix_c1_controls(&text);
677 let steps = if explain {
678 Some(vec![ExplanationStep {
679 transformation: String::from("fix_c1_controls"),
680 }])
681 } else {
682 None
683 };
684 return ExplainedText {
685 text: fixed.into(),
686 steps,
687 };
688 }
689
690 ExplainedText { text, steps: None }
697}
698
699#[cfg(test)]
700mod tests {
701 use super::fix_text;
702 use pretty_assertions::assert_eq;
703
704 #[test]
705 fn test_messy_language_names_czech() {
706 let original = "Čeština";
707 let expected = "Čeština";
708 let result = fix_text(original, None);
709 assert_eq!(result, expected);
710 }
711
712 #[test]
713 fn test_messy_language_names_gaelic() {
714 let original = "GÃ idhlig";
715 let expected = "Gàidhlig";
716 let result = fix_text(original, None);
717 assert_eq!(result, expected);
718 }
719
720 #[test]
721 fn test_messy_language_names_lithuanian() {
722 let original = "Lietuvių";
723 let expected = "Lietuvių";
724 let result = fix_text(original, None);
725 assert_eq!(result, expected);
726 }
727
728 #[test]
729 fn test_messy_language_names_slovak() {
730 let original = "Sloven�ina";
731 let expected = "Sloven�ina";
732 let result = fix_text(original, None);
733 assert_eq!(result, expected);
734 }
735
736 #[test]
737 fn test_messy_language_names_vietnamese() {
738 let original = "Tiếng Việt";
739 let expected = "Tiếng Việt";
740 let result = fix_text(original, None);
741 assert_eq!(result, expected);
742 }
743
744 #[test]
745 fn test_messy_language_names_greek() {
746 let original = "Ελληνικά";
747 let expected = "Ελληνικά";
748 let result = fix_text(original, None);
749 assert_eq!(result, expected);
750 }
751
752 #[test]
753 fn test_messy_language_names_bulgarian() {
754 let original = "българ�ки език";
755 let expected = "българ�ки език";
756 let result = fix_text(original, None);
757 assert_eq!(result, expected);
758 }
759
760 #[test]
761 fn test_messy_language_names_russian() {
762 let original = "Ру��кий";
763 let expected = "Ру��кий";
764 let result = fix_text(original, None);
765 assert_eq!(result, expected);
766 }
767
768 #[test]
769 fn test_messy_language_names_serbian_cyrillic() {
770 let original = "Cрп�ки [ћирилицом]";
771 let expected = "Cрп�ки [ћирилицом]";
772 let result = fix_text(original, None);
773 assert_eq!(result, expected);
774 }
775
776 #[test]
777 fn test_messy_language_names_hebrew() {
778 let original = "עברית";
779 let expected = "עברית";
780 let result = fix_text(original, None);
781 assert_eq!(result, expected);
782 }
783
784 #[test]
785 fn test_messy_language_names_russian_2() {
786 let original = "Ру��кий";
787 let expected = "Ру��кий";
788 let result = fix_text(original, None);
789 assert_eq!(result, expected);
790 }
791
792 #[test]
793 fn test_messy_language_names_hindi() {
794 let original = "हिन�दी";
795 let expected = "हिन�दी";
796 let result = fix_text(original, None);
797 assert_eq!(result, expected);
798 }
799
800 #[test]
801 fn test_messy_language_names_tamil() {
802 let original = "தமிழ�";
803 let expected = "தமிழ�";
804 let result = fix_text(original, None);
805 assert_eq!(result, expected);
806 }
807
808 #[test]
809 fn test_messy_language_names_thai() {
810 let original = "ภาษาไทย";
811 let expected = "ภาษาไทย";
812 let result = fix_text(original, None);
813 assert_eq!(result, expected);
814 }
815
816 #[test]
817 fn test_messy_language_names_simplified_chinese() {
818 let original = "ç®€ä½“ä¸æ–‡";
819 let expected = "简体中文";
820 let result = fix_text(original, None);
821 assert_eq!(result, expected);
822 }
823
824 #[test]
825 fn test_messy_language_names_traditional_chinese() {
826 let original = "æ£é«”䏿–‡";
827 let expected = "正體中文";
828 let result = fix_text(original, None);
829 assert_eq!(result, expected);
830 }
831
832 #[test]
833 fn test_messy_language_names_japanese() {
834 let original = "日本語";
835 let expected = "日本語";
836 let result = fix_text(original, None);
837 assert_eq!(result, expected);
838 }
839
840 #[test]
841 fn test_messy_language_names_korean() {
842 let original = "한êµì–´";
843 let expected = "한국어";
844 let result = fix_text(original, None);
845 assert_eq!(result, expected);
846 }
847
848 #[test]
849 fn test_low_codepoint_emoji() {
850 let original = "He's Justinâ¤";
851 let expected = "He's Justin❤";
852 let result = fix_text(original, None);
853 assert_eq!(result, expected);
854 }
855
856 #[test]
857 fn test_utf8_macroman_mix_up_about_smurfs() {
858 let original = "Le Schtroumpf Docteur conseille gâteaux et baies schtroumpfantes pour un régime équilibré.";
859 let expected = "Le Schtroumpf Docteur conseille gâteaux et baies schtroumpfantes pour un régime équilibré.";
860 let result = fix_text(original, None);
861 assert_eq!(result, expected);
862 }
863
864 #[test]
865 fn test_checkmark_that_almost_looks_okay_as_mojibake() {
866 let original = "✔ No problems";
867 let expected = "✔ No problems";
868 let result = fix_text(original, None);
869 assert_eq!(result, expected);
870 }
871
872 #[test]
873 fn test_utf8_windows_1251_russian_mixup_about_futbol() {
874 let original = "РґРѕСЂРѕРіРµ РР·-РїРѕРґ #футбол";
875 let expected = "дороге Из-под #футбол";
876 let result = fix_text(original, None);
877 assert_eq!(result, expected);
878 }
879
880 #[test]
881 fn test_latin1_windows_1252_mixup_in_german() {
882 let original = "Handwerk bringt dich überall hin: Von der YOU bis nach Monaco";
883 let expected = "\"Handwerk bringt dich überall hin\": Von der YOU bis nach Monaco";
884 let result = fix_text(original, None);
885 assert_eq!(result, expected);
886 }
887
888 #[test]
889 fn test_latin1_windows_1252_mixup_of_the_replacement_character() {
890 let original = "Some comments may be republished on the website or in the newspaper � email addresses will not be published.";
891 let expected = "Some comments may be republished on the website or in the newspaper � email addresses will not be published.";
892 let result = fix_text(original, None);
893 assert_eq!(result, expected);
894 }
895
896 #[test]
897 fn test_cesu8_windows_1252_emoji() {
898 let original = "Hi guys í ½í¸";
899 let expected = "Hi guys 😍";
900 let result = fix_text(original, None);
901 assert_eq!(result, expected);
902 }
903
904 #[test]
905 fn test_cesu8_latin1_emoji() {
906 let original = "hihi RT username: âºí ½í¸";
907 let expected = "hihi RT username: ☺😘";
908 let result = fix_text(original, None);
909 assert_eq!(result, expected);
910 }
911
912 #[test]
913 fn test_latin1_windows_1252_mixup_in_turkish() {
914 let original = "Beta Haber: Hırsızı Büyü Korkuttu";
915 let expected = "Beta Haber: Hırsızı Büyü Korkuttu";
916 let result = fix_text(original, None);
917 assert_eq!(result, expected);
918 }
919
920 #[test]
921 fn test_utf8_windows_1251_mixed_up_twice_in_russian() {
922 let original = "приятности. ❤";
923 let expected = "приятности. ❤";
924 let result = fix_text(original, None);
925 assert_eq!(result, expected);
926 }
927
928 #[test]
929 fn test_utf8_windows_1252_mixed_up_twice_in_malay() {
930 let original = "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New “ Romanceâ€Â.";
931 let expected = "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New \" Romance\".";
932 let result = fix_text(original, None);
933 assert_eq!(result, expected);
934 }
935
936 #[test]
937 fn test_utf8_windows_1252_mixed_up_twice_in_naming_iggy_pop() {
938 let original = "Iggy Pop (né Jim Osterberg)";
939 let expected = "Iggy Pop (né Jim Osterberg)";
940 let result = fix_text(original, None);
941 assert_eq!(result, expected);
942 }
943
944 #[test]
945 fn test_left_quote_is_utf8_right_quote_is_latin1_both_encoded_in_windows_1252() {
946 let original = "Direzione Pd, ok âsenza modifiche all'Italicum.";
947 let expected = "Direzione Pd, ok \"senza modifiche\" all'Italicum.";
948 let result = fix_text(original, None);
949 assert_eq!(result, expected);
950 }
951
952 #[test]
953 fn test_utf8_sloppy_windows_1252_mixed_up_twice_in_a_triumphant_emoticon() {
954 let original = "selamat berpuasa sob (à ¸‡'̀⌣'ÃŒÂ)à ¸‡";
955 let expected = "selamat berpuasa sob (ง'̀⌣'́)ง";
956 let result = fix_text(original, None);
957 assert_eq!(result, expected);
958 }
959
960 #[test]
961 fn test_utf8_windows_1252_mixed_up_three_times() {
962 let original = "The Mona Lisa doesn’t have eyebrows.";
963 let expected = "The Mona Lisa doesn't have eyebrows.";
964 let result = fix_text(original, None);
965 assert_eq!(result, expected);
966 }
967
968 #[test]
969 fn test_utf8_codepag_437_mixup_in_russian() {
970 let original = "#правильноепитание";
971 let expected = "#правильноепитание";
972 let result = fix_text(original, None);
973 assert_eq!(result, expected);
974 }
975
976 #[test]
977 fn test_utf8_windows_1252_mixup_in_french() {
978 let original = "Hôtel de Police";
979 let expected = "Hôtel de Police";
980 let result = fix_text(original, None);
981 assert_eq!(result, expected);
982 }
983
984 #[test]
985 fn test_utf8_windows_1250_mixup_in_french() {
986 let original = "Liège Avenue de l'Hôpital";
987 let expected = "Liège Avenue de l'Hôpital";
988 let result = fix_text(original, None);
989 assert_eq!(result, expected);
990 }
991
992 #[test]
993 fn test_utf8_windows_1252_mixup_in_vietnamese() {
994 let original = "Tại sao giá hạt sầu riêng lại lên giá?";
995 let expected = "Tại sao giá hạt sầu riêng lại lên giá?";
996 let result = fix_text(original, None);
997 assert_eq!(result, expected);
998 }
999
1000 #[test]
1001 fn test_negative_using_diaereses_as_quotation_marks_in_greek() {
1002 let original = "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές";
1003 let expected = "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές";
1004 let result = fix_text(original, None);
1005 assert_eq!(result, expected);
1006 }
1007
1008 #[test]
1009 fn test_science_mid_word_greek_letter_gets_fixed_correctly() {
1010 let original = "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.";
1011 let expected = "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.";
1012 let result = fix_text(original, None);
1013 assert_eq!(result, expected);
1014 }
1015
1016 #[test]
1017 fn test_negative_more_science_dont_fix_a_multiplication_symbol_in_quotes() {
1018 let original = "higher values (“+” and “×” curves) in the superficial region";
1019 let expected = "higher values (\"+\" and \"×\" curves) in the superficial region";
1020 let result = fix_text(original, None);
1021 assert_eq!(result, expected);
1022 }
1023
1024 #[test]
1025 fn test_for_goodness_sake_we_can_come_close_to_fixing_this_but_fail_in_the_last_step() {
1026 let original = "It�¢��s classic. It�¢��s epic. It�¢��s ELIZABETH BENNET for goodness�¢�� sake!";
1027 let expected =
1028 "It�¢��s classic. It�¢��s epic. It�¢��s ELIZABETH BENNET for goodness�¢�� sake!";
1029 let result = fix_text(original, None);
1030 assert_eq!(result, expected);
1031 }
1032
1033 #[test]
1034 fn test_lossy_utf8_windows_1250_mixup_in_spanish() {
1035 let original =
1036 "Europa, Asia, Ă�frica, Norte, AmĂ©rica Central y del Sur, Australia y OceanĂa";
1037 let expected =
1038 "Europa, Asia, �frica, Norte, América Central y del Sur, Australia y Oceanía";
1039 let result = fix_text(original, None);
1040 assert_eq!(result, expected);
1041 }
1042
1043 #[test]
1044 fn test_utf8_sloppy_windows_1250_mixup_in_english() {
1045 let original = "It was named „scars´ stones“ after the rock-climbers who got hurt while climbing on it.";
1046 let expected = "It was named \"scars´ stones\" after the rock-climbers who got hurt while climbing on it.";
1047 let result = fix_text(original, None);
1048 assert_eq!(result, expected);
1049 }
1050
1051 #[test]
1052 fn test_the_same_text_as_above_but_as_a_utf8_iso_8859_2_mixup() {
1053 let original = "It was named âscars´ stonesâ after the rock-climbers who got hurt while climbing on it.";
1054 let expected = "It was named \"scars´ stones\" after the rock-climbers who got hurt while climbing on it.";
1055 let result = fix_text(original, None);
1056 assert_eq!(result, expected);
1057 }
1058
1059 #[test]
1060 fn test_utf8_windows1252_mixup_in_mixed_french_and_arabic() {
1061 let original = "À tous mes frères et soeurs dans la syrienneté comme dans l’humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l’égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.";
1062 let expected = "À tous mes frères et soeurs dans la syrienneté comme dans l'humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l'égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.";
1063 let result = fix_text(original, None);
1064 assert_eq!(result, expected);
1065 }
1066
1067 #[test]
1068 fn test_utf8_sloppy_windows_1250_mixup_in_romanian() {
1069 let original = "vedere Ă®nceĹŁoĹźatÄ";
1070 let expected = "vedere înceţoşată";
1071 let result = fix_text(original, None);
1072 assert_eq!(result, expected);
1073 }
1074
1075 #[test]
1076 fn test_utf8_windows_1250_mixup_in_slovak() {
1077 let original = "NapĂšte nám !";
1078 let expected = "Napíšte nám !";
1079 let result = fix_text(original, None);
1080 assert_eq!(result, expected);
1081 }
1082
1083 #[test]
1084 fn test_utf8_windows_1252_mixup_in_spanish() {
1085 let original = "DOS AÑOS";
1086 let expected = "DOS AÑOS";
1087 let result = fix_text(original, None);
1088 assert_eq!(result, expected);
1089 }
1090
1091 #[test]
1092 fn test_utf8_windows_1252_followed_by_utf8_windows_1251() {
1093 let original =
1094 "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator";
1095 let expected =
1096 "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator";
1097 let result = fix_text(original, None);
1098 assert_eq!(result, expected);
1099 }
1100
1101 #[test]
1102 fn test_fancy_unicode_crossing_out_but_mojibaked() {
1103 let original = "hotel $49 $̶6̶3̶ updated 2018";
1104 let expected = "hotel $49 $̶6̶3̶ updated 2018";
1105 let result = fix_text(original, None);
1106 assert_eq!(result, expected);
1107 }
1108
1109 #[test]
1110 fn test_a_face_with_utf8_sloppy_windows_1252_mixed_up_twice() {
1111 let original = "ââ€â€™(⌣˛⌣)ââ€Å½";
1112 let expected = "┒(⌣˛⌣)┎";
1113 let result = fix_text(original, None);
1114 assert_eq!(result, expected);
1115 }
1116
1117 #[test]
1118 fn test_we_can_mostly_decode_the_face_above_when_we_lose_the_character_u009d() {
1119 let original = "�(⌣˛⌣)�";
1120 let expected = "�(⌣˛⌣)�";
1121 let result = fix_text(original, None);
1122 assert_eq!(result, expected);
1123 }
1124
1125 #[test]
1126 fn test_lossy_decoding_can_have_plain_ascii_question_marks_as_well() {
1127 let original = "The ICR has been upgraded to “bb+� from “bb�";
1128 let expected = "The ICR has been upgraded to \"bb+� from \"bb�";
1129 let result = fix_text(original, None);
1130 assert_eq!(result, expected);
1131 }
1132
1133 #[test]
1134 fn test_cesu8_latin_1_mixup_over_several_emoji() {
1135 let original =
1136 "I just figured out how to tweet emojis! â\u{009a}½í\u{00a0}½í¸\u{0080}í\u{00a0}½í¸\u{0081}í\u{00a0}½í¸\u{0082}í\u{00a0}½í¸\u{0086}í\u{00a0}½í¸\u{008e}í\u{00a0}½í¸\u{008e}í\u{00a0}½í¸\u{008e}í\u{00a0}½í¸\u{008e}";
1137 let expected = "I just figured out how to tweet emojis! ⚽😀😁😂😆😎😎😎😎";
1138 let result = fix_text(original, None);
1139 assert_eq!(result, expected);
1140 }
1141
1142 #[test]
1143 fn test_an_absolutely_hopeless_garble() {
1144 let original = "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â";
1145 let expected = "ã†â€™ãƒâ€ ã¢â'¬â\"¢ãƒæ'ã'â¢ãƒâ¢ã¢â'¬å¡ã'â¬ãƒâ€šã'â";
1146 let result = fix_text(original, None);
1147 assert_eq!(result, expected);
1148 }
1149
1150 #[test]
1151 fn test_inconsistent_utf8_latin1_mojibake() {
1152 let original =
1153 "Ecuadorâs âpurely political decision on Assangeâ is likely result of âUS pressureâ
";
1154 let expected =
1155 "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…";
1156 let result = fix_text(original, None);
1157 assert_eq!(result, expected);
1158 }
1159
1160 #[test]
1161 fn test_inconsistent_utf8_latin1_mojibake_with_an_ellipsis_from_the_windows_1252_character_set()
1162 {
1163 let original =
1164 "Ecuadorâs âpurely political decision on Assangeâ is likely result of âUS pressureâ…";
1165 let expected =
1166 "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…";
1167 let result = fix_text(original, None);
1168 assert_eq!(result, expected);
1169 }
1170
1171 #[test]
1172 fn test_inconsistent_mojibake_in_portuguese() {
1173 let original = "Campeonatos > III Divisão - Série F > Jornadas Classificação";
1174 let expected = "Campeonatos > III Divisão - Série F > Jornadas Classificação";
1175 let result = fix_text(original, None);
1176 assert_eq!(result, expected);
1177 }
1178
1179 #[test]
1180 fn test_handle_afrikaans_n_character() {
1181 let original = "ʼn Chloroplas is ʼn organel wat in fotosinterende plante voorkom.";
1182 let expected = "'n Chloroplas is 'n organel wat in fotosinterende plante voorkom.";
1183 let result = fix_text(original, None);
1184 assert_eq!(result, expected);
1185 }
1186
1187 #[test]
1188 fn test_handle_croatian_single_codepoint_digraphs() {
1189 let original = "izum „bootstrap load“ koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu";
1190 let expected = "izum \"bootstrap load\" koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu";
1191 let result = fix_text(original, None);
1192 assert_eq!(result, expected);
1193 }
1194
1195 #[test]
1196 fn test_a_with_an_acute_accent_in_isolation() {
1197 let original = "Nicolás";
1198 let expected = "Nicolás";
1199 let result = fix_text(original, None);
1200 assert_eq!(result, expected);
1201 }
1202
1203 #[test]
1204 fn test_sharp_s_in_isolation_via_macroman_encoding() {
1205 let original = "weiß";
1206 let expected = "weiß";
1207 let result = fix_text(original, None);
1208 assert_eq!(result, expected);
1209 }
1210
1211 #[test]
1212 fn test_negative_è_preceded_by_a_non_breaking_space_is_not_a_small_capital_y() {
1213 let original =
1214 "Con il corpo e lo spirito ammaccato, è come se nel cuore avessi un vetro conficcato.";
1215 let expected =
1216 "Con il corpo e lo spirito ammaccato, è come se nel cuore avessi un vetro conficcato.";
1217 let result = fix_text(original, None);
1218 assert_eq!(result, expected);
1219 }
1220
1221 #[test]
1222 fn test_negative_multiplication_sign_and_ellipsis() {
1223 let original = "4288×…";
1224 let expected = "4288×…";
1225 let result = fix_text(original, None);
1226 assert_eq!(result, expected);
1227 }
1228
1229 #[test]
1230 fn test_negative_accents_are_sometimes_used_as_quotes() {
1231 let original = "``toda produzida pronta pra assa aí´´";
1232 let expected = "``toda produzida pronta pra assa aí´´";
1233 let result = fix_text(original, None);
1234 assert_eq!(result, expected);
1235 }
1236
1237 #[test]
1238 fn test_negative_õ_followed_by_an_ellipsis() {
1239 let original = "HUHLL Õ…";
1240 let expected = "HUHLL Õ…";
1241 let result = fix_text(original, None);
1242 assert_eq!(result, expected);
1243 }
1244
1245 #[test]
1246 fn test_negative_ê_followed_by_an_ellipsis() {
1247 let original = "RETWEET SE VOCÊ…";
1248 let expected = "RETWEET SE VOCÊ…";
1249 let result = fix_text(original, None);
1250 assert_eq!(result, expected);
1251 }
1252
1253 #[test]
1254 fn test_negative_é_followed_by_an_ellipsis() {
1255 let original = "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…";
1256 let expected = "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…";
1257 let result = fix_text(original, None);
1258 assert_eq!(result, expected);
1259 }
1260
1261 #[test]
1262 fn test_negative_ó_followed_by_an_ellipsis() {
1263 let original = "TEM QUE SEGUIR, SDV SÓ…";
1264 let expected = "TEM QUE SEGUIR, SDV SÓ…";
1265 let result = fix_text(original, None);
1266 assert_eq!(result, expected);
1267 }
1268
1269 #[test]
1270 fn test_negative_é_followed_by_a_curly_apostrophe() {
1271 let original = "Join ZZAJÉ’s Official Fan List and receive news, events, and more!";
1272 let expected = "Join ZZAJÉ's Official Fan List and receive news, events, and more!";
1273 let result = fix_text(original, None);
1274 assert_eq!(result, expected);
1275 }
1276
1277 #[test]
1278 fn test_negative_é_preceded_by_curly_apostrophe() {
1279 let original = "L’épisode 8 est trop fou ouahh";
1280 let expected = "L'épisode 8 est trop fou ouahh";
1281 let result = fix_text(original, None);
1282 assert_eq!(result, expected);
1283 }
1284
1285 #[test]
1286 fn test_negative_three_raised_eyebrows_or_something() {
1287 let original = "Ôôô VIDA MINHA";
1288 let expected = "Ôôô VIDA MINHA";
1289 let result = fix_text(original, None);
1290 assert_eq!(result, expected);
1291 }
1292
/// Negative case: a copyright sign preceded by a non-breaking space must come
/// back from `fix_text` unchanged.
///
/// The no-break space is spelled `\u{00a0}` so the character this test is
/// about stays visible and cannot be silently replaced by an ordinary space
/// when the file passes through an editor or formatter.
#[test]
fn test_negative_copyright_sign_preceded_by_non_breaking_space() {
    let text = "[x]\u{00a0}©";
    assert_eq!(fix_text(text, None), text);
}
1300
/// Negative case: a dash followed by an infinity sign must come back from
/// `fix_text` unchanged.
#[test]
fn test_negative_en_dash_and_infinity_sign() {
    let text = "2012—∞";
    assert_eq!(fix_text(text, None), text);
}
1308
/// Negative case: a single Cyrillic letter inside an otherwise Latin word is
/// not treated as damage; the text must come back unchanged.
#[test]
fn test_negative_this_e_is_a_ukrainian_letter_but_nothing_else_is_wrong() {
    let text = "SENSЕ - Oleg Tsedryk";
    assert_eq!(fix_text(text, None), text);
}
1316
/// Negative case: an emoticon built from `¬` and `´` must come back from
/// `fix_text` unchanged.
#[test]
fn test_negative_angry_face() {
    let text = "OK??:( `¬´ ):";
    assert_eq!(fix_text(text, None), text);
}
1324
/// Negative case: an emoticon containing `¬ô` must come back from `fix_text`
/// unchanged.
#[test]
fn test_negative_synthetic_face_with_glasses_and_a_raised_eyebrow() {
    let text = "( o¬ô )";
    assert_eq!(fix_text(text, None), text);
}
1332
/// Negative case: the two-character string `∆°` must come back from
/// `fix_text` unchanged.
#[test]
fn test_negative_triangle_and_degree_sign() {
    let text = "∆°";
    assert_eq!(fix_text(text, None), text);
}
1340
/// Negative case: `É` followed by an inverted question mark must come back
/// from `fix_text` unchanged.
#[test]
fn test_negative_portuguese_with_inverted_question_mark() {
    let text = "ESSE CARA AI QUEM É¿";
    assert_eq!(fix_text(text, None), text);
}
1348
/// Negative case: backticks and acute accents used as makeshift quotation
/// marks must come back from `fix_text` unchanged.
#[test]
fn test_negative_portuguese_with_acute_accents_as_quotation_marks() {
    let text = "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´";
    assert_eq!(fix_text(text, None), text);
}
1356
/// Negative case: a capital `Ä` followed by a non-breaking space must come
/// back from `fix_text` unchanged.
///
/// The no-break spaces are spelled `\u{00a0}` so the character under test is
/// visible and survives editors and formatters.
/// NOTE(review): both `Ä`s are assumed to be followed by U+00A0 — confirm
/// against the upstream test data.
#[test]
fn test_negative_finnish_ä_followed_by_a_non_breaking_space() {
    let text = "SELKÄ\u{00a0}EDELLÄ\u{00a0}MAAHAN via @YouTube";
    assert_eq!(fix_text(text, None), text);
}
1364
/// Negative case: a multiplication sign next to a pound sign must come back
/// from `fix_text` unchanged.
#[test]
fn test_negative_multiplying_by_currency() {
    let text = "Offering 5×£35 pin ups";
    assert_eq!(fix_text(text, None), text);
}
1372
/// Negative case: `É` followed by a registered-trademark sign must come back
/// from `fix_text` unchanged.
#[test]
fn test_negative_registered_chocolate_brand_name() {
    let text = "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional";
    assert_eq!(fix_text(text, None), text);
}
1380
/// Mostly-negative case: the accented French text is already correct; the
/// only repair expected is the C1 control U+0085 turning into an ellipsis.
///
/// The control character is written as `\u{0085}` rather than as a raw
/// character: U+0085 is a line-break character in many tools, so a raw
/// occurrence splits the literal across two lines and is easy to lose.
#[test]
fn test_mostly_negative_we_only_need_to_fix_c1_control_characters() {
    let fixed = fix_text(
        "C'est vrai que nous n'en avons pas encore beaucoup parlé\u{0085} Tu sais, ça fait de nombreuses années",
        None,
    );
    assert_eq!(
        fixed,
        "C'est vrai que nous n'en avons pas encore beaucoup parlé… Tu sais, ça fait de nombreuses années"
    );
}
1388
/// `Ã` followed by a non-breaking space is the mojibake form of `à`; the
/// ordinary space after it must survive so that `à` stays its own word.
///
/// The no-break space is spelled `\u{00a0}` so it cannot be confused with, or
/// merged into, the ordinary space that follows it.
#[test]
fn test_french_example_containing_non_breaking_spaces() {
    let fixed = fix_text("ART TRIP Ã\u{00a0} l'office de tourisme", None);
    assert_eq!(fixed, "ART TRIP à l'office de tourisme");
}
1396
/// Checks the English sample sentence against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the mojibake/ligature input
/// named by this test was not normalised away.
#[test]
fn test_english_example_in_utf8_windows_1251_with_a_ligature() {
    let fixed = fix_text(
        "This is significantly lower than the respective share",
        None,
    );
    assert_eq!(
        fixed,
        "This is significantly lower than the respective share"
    );
}
1404
/// Checks the French sample `voilà le travail` against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the mojibake input named by
/// this test was not normalised away.
#[test]
fn test_synthetic_we_can_recognize_ã_in_some_cases_when_its_the_only_mojibake() {
    let fixed = fix_text("voilà le travail", None);
    assert_eq!(fixed, "voilà le travail");
}
1412
/// Checks the French sample `voilà le travail` against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the mojibake input named by
/// this test was not normalised away.
#[test]
fn test_synthetic_we_can_recognize_ã_at_the_end_of_a_word_when_it_absorbs_a_following_space() {
    let fixed = fix_text("voilà le travail", None);
    assert_eq!(fixed, "voilà le travail");
}
1420
/// Negative case: a spaced-out word containing a genuine `Ã` must come back
/// from `fix_text` unchanged.
#[test]
fn test_negative_we_dont_fix_ã_in_all_contexts() {
    let text = "C O N C L U S Ã O";
    assert_eq!(fix_text(text, None), text);
}
1428
/// Checks that the French sentence starting with the stand-alone word `à`
/// matches its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the mojibake input named by
/// this test was not normalised away.
#[test]
fn test_à_remains_its_own_word_even_if_spaces_after_it_get_coalesced_into_one() {
    let fixed = fix_text(
        "à perturber la réflexion des théologiens jusqu'à nos jours",
        None,
    );
    assert_eq!(
        fixed,
        "à perturber la réflexion des théologiens jusqu'à nos jours"
    );
}
1436
/// Checks the French sentence against its expected output; as written, the
/// only difference is the curly apostrophes becoming straight ones.
///
/// NOTE(review): the input shows no mojibake beyond the curly apostrophes —
/// confirm the inconsistent-mojibake input named by this test was not
/// normalised away.
#[test]
fn test_fix_à_in_inconsistent_mojibake() {
    let fixed = fix_text(
        "Le barème forfaitaire permet l’évaluation des frais de déplacement relatifs à l’utilisation",
        None,
    );
    assert_eq!(
        fixed,
        "Le barème forfaitaire permet l'évaluation des frais de déplacement relatifs à l'utilisation"
    );
}
1444
/// Checks that the Portuguese word `às` is kept as one word in the output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the mojibake input named by
/// this test was not normalised away.
#[test]
fn test_the_portuguese_word_às_does_not_become_à_s_due_to_the_french_fix() {
    let fixed = fix_text("com especial atenção às crianças", None);
    assert_eq!(fixed, "com especial atenção às crianças");
}
1452
/// Checks that French `à s'éloigner` keeps `à` separate from the following
/// `s'` in the output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the mojibake input named by
/// this test was not normalised away.
#[test]
fn test_this_is_why_we_require_a_space_after_the_s_in_às() {
    let fixed = fix_text(
        "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.",
        None,
    );
    assert_eq!(
        fixed,
        "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins."
    );
}
1460
/// Checks the French sentence against its expected output; as written, the
/// only difference is the curly apostrophes becoming straight ones.
///
/// NOTE(review): the input shows no mojibake beyond the curly apostrophes —
/// confirm the Windows-1251 mojibake input named by this test was not
/// normalised away.
#[test]
fn test_we_can_fix_à_in_windows_1251_sometimes_as_well() {
    let fixed = fix_text(
        "La région de Dnepropetrovsk se trouve à l’ouest de l’Ukraine",
        None,
    );
    assert_eq!(
        fixed,
        "La région de Dnepropetrovsk se trouve à l'ouest de l'Ukraine"
    );
}
1468
/// Portuguese UTF-8 text misread as Windows-1252, where the byte 0xA0 of `à`
/// has been flattened to an ordinary space: `Ã quele` must be repaired to the
/// single word `àquele`, not to `à quele`.
///
/// The input is written as consistent mojibake of the expected text. The soft
/// hyphen that forms the second half of a mis-decoded `í` is spelled
/// `\u{00ad}` because it is invisible as a raw character. An input that
/// already contains a decoded `à` followed by a space could never produce
/// `àquele`.
/// NOTE(review): the input was reconstructed from the expected output —
/// confirm against the upstream test data.
#[test]
fn test_ã_quele_is_the_portuguese_word_àquele_not_à_quele() {
    let fixed = fix_text(
        "eliminado o antÃ\u{00ad}geno e mantidos os nÃ\u{00ad}veis de anticorpos, surgem as condiÃ§Ãµes necessÃ¡rias ao estabelecimento do granuloma, semelhante Ã quele observado nas lesÃµes por imunocomplexo em excesso de anticorpos",
        None,
    );
    assert_eq!(
        fixed,
        "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante àquele observado nas lesões por imunocomplexo em excesso de anticorpos"
    );
}
1476
/// Checks a long Portuguese post containing U+FFFD replacement characters
/// against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the mojibake input named by
/// this test was not normalised away.
#[test]
fn test_a_complex_lossy_pile_up_of_mojibake_in_portuguese() {
    let fixed = fix_text(
        "⠀ � Regulamento: ⠀ ⚠� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. ⚠� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. ⚠� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até às 19h do mesmo dia em uma nova publicação em nosso instagram. ⠀ Boa sorte!!! 😀�",
        None,
    );
    assert_eq!(
        fixed,
        "⠀ � Regulamento: ⠀ ⚠� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. ⚠� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. ⚠� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até às 19h do mesmo dia em uma nova publicação em nosso instagram. ⠀ Boa sorte!!! 😀�"
    );
}
1484
/// UTF-8 read as Windows-1252: `à` (bytes C3 A0) shows up as `Ã` followed by
/// a non-breaking space, which must be folded back into a single `à`.
///
/// The no-break spaces are spelled `\u{00a0}` so the characters under test
/// are visible and cannot be swapped for ordinary spaces by an editor.
#[test]
fn test_utf8_windows_1252_mixup_in_gaelic_involving_non_breaking_spaces() {
    let fixed = fix_text("CÃ\u{00a0}nan nan GÃ\u{00a0}idheal", None);
    assert_eq!(fixed, "Cànan nan Gàidheal");
}
1492
/// The C1 controls U+0093 and U+0094 around a Spanish phrase are expected to
/// come out as straight double quotes, with the accented text untouched.
#[test]
fn test_misleading_mix_up_in_spanish() {
    let fixed = fix_text(
        "tiene demora y está \u{0093}próximo a resolverse\u{0094}",
        None,
    );
    assert_eq!(fixed, "tiene demora y está \"próximo a resolverse\"");
}
1500
/// Checks a sentence ending in a musical-note symbol against its expected
/// output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the mojibake input named by
/// this test was not normalised away.
#[test]
fn test_punctuation_pile_up_should_actually_be_musical_notes() {
    let fixed = fix_text("Engkau masih yg terindah, indah di dalam hatiku♫~", None);
    assert_eq!(fixed, "Engkau masih yg terindah, indah di dalam hatiku♫~");
}
1508
/// Checks a headline containing an en dash against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the mojibake input named by
/// this test was not normalised away.
#[test]
fn test_utf8_windows_1251_mixup_in_tweet_spam() {
    let fixed = fix_text("Blog Traffic Tip 2 – Broadcast Email Your Blog", None);
    assert_eq!(fixed, "Blog Traffic Tip 2 – Broadcast Email Your Blog");
}
1516
/// Checks a headline against its expected output; as written, the only
/// difference is curly quotes and the apostrophe becoming straight ones.
///
/// NOTE(review): the input shows no mojibake beyond the curly punctuation —
/// confirm the Windows-1251 mojibake input named by this test was not
/// normalised away.
#[test]
fn test_utf8_windows_1251_mixup() {
    let fixed = fix_text("S&P Confirms Ukrsotsbank’s “B-“ Rating", None);
    assert_eq!(fixed, "S&P Confirms Ukrsotsbank's \"B-\" Rating");
}
1524
/// Checks the Dutch word `ongeëvenaard` against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm a mojibake input was not
/// normalised away.
#[test]
fn test_dutch_example_with_ë() {
    let fixed = fix_text("ongeëvenaard", None);
    assert_eq!(fixed, "ongeëvenaard");
}
1532
/// Negative case: text deliberately written with decorative accented capitals
/// and currency signs must come back from `fix_text` unchanged.
#[test]
fn test_negative_indonesian_leetspeak() {
    let text = "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????, ......JÄDÍ...";
    assert_eq!(fix_text(text, None), text);
}
1540
/// Checks the French sentence with `télécharger` against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the triple-layer mojibake
/// input named by this test was not normalised away.
#[test]
fn test_three_layers_of_utf8_macroman_mixup_in_french() {
    let fixed = fix_text("Merci de télécharger le plug-in Flash Player 8", None);
    assert_eq!(fixed, "Merci de télécharger le plug-in Flash Player 8");
}
1548
/// Checks a French sentence ending in an ellipsis against its expected
/// output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the MacRoman mojibake input
/// named by this test was not normalised away.
#[test]
fn test_utf8_macroman_mixup_in_french() {
    let fixed = fix_text(
        "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter…",
        None,
    );
    assert_eq!(
        fixed,
        "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter…"
    );
}
1556
/// Checks the Italian name `Le Vigne di Zamò` against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the MacRoman mojibake input
/// named by this test was not normalised away.
#[test]
fn test_italian_utf8_macroman_example_with_ò() {
    let fixed = fix_text("Le Vigne di Zamò", None);
    assert_eq!(fixed, "Le Vigne di Zamò");
}
1564
/// Checks the Hebrew word against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the Windows-1252 mojibake
/// input named by this test was not normalised away.
#[test]
fn test_hebrew_utf8_windows_1252_mojibake() {
    let fixed = fix_text("בהודעה", None);
    assert_eq!(fixed, "בהודעה");
}
1572
/// Checks the Hebrew word against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the Windows-1250 mojibake
/// input named by this test was not normalised away.
#[test]
fn test_synthetic_hebrew_utf8_windows_1250_mojibake() {
    let fixed = fix_text("בהודעה", None);
    assert_eq!(fixed, "בהודעה");
}
1580
/// Checks the Hebrew word against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the MacRoman mojibake input
/// named by this test was not normalised away.
#[test]
fn test_synthetic_hebrew_utf8_macroman_mojibake() {
    let fixed = fix_text("בהודעה", None);
    assert_eq!(fixed, "בהודעה");
}
1588
/// Hebrew UTF-8 misread as Latin-1: each letter's lead byte 0xD7 appears as
/// `×` and its continuation byte (0x90 for alef, 0x91 for bet) as a C1
/// control character.
///
/// The C1 controls are spelled `\u{0090}` / `\u{0091}` because they are
/// invisible as raw characters; without them the input would be four bare
/// multiplication signs.
#[test]
fn test_synthetic_hebrew_utf8_latin1_mojibake() {
    let fixed = fix_text("×\u{0090}×\u{0091}×\u{0091}×\u{0090}", None);
    assert_eq!(fixed, "אבבא");
}
1596
/// Checks the Arabic word against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the Windows-1252 mojibake
/// input named by this test was not normalised away.
#[test]
fn test_synthetic_arabic_utf8_windows_1252_mojibake() {
    let fixed = fix_text("رسالة", None);
    assert_eq!(fixed, "رسالة");
}
1604
/// Checks the Arabic word against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the Windows-1250 mojibake
/// input named by this test was not normalised away.
#[test]
fn test_synthetic_arabic_utf8_windows_1250_mojibake() {
    let fixed = fix_text("رسالة", None);
    assert_eq!(fixed, "رسالة");
}
1612
/// Checks the Arabic word against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the MacRoman mojibake input
/// named by this test was not normalised away.
#[test]
fn test_synthetic_arabic_utf8_macroman_mojibake() {
    let fixed = fix_text("رسالة", None);
    assert_eq!(fixed, "رسالة");
}
1620
/// Negative case: a formula using `√` and `π` must come back from `fix_text`
/// unchanged.
#[test]
fn test_negative_math_in_unicode() {
    let text = "(-1/2)! = √π";
    assert_eq!(fix_text(text, None), text);
}
1628
/// Negative case: box-drawing characters interleaved with letters must come
/// back from `fix_text` unchanged.
#[test]
fn test_negative_leet_line_art() {
    let text = "├┤a┼┐a┼┐a┼┐a┼┐a";
    assert_eq!(fix_text(text, None), text);
}
1636
/// Negative case for encoding repair: `ë…”` at the end of a name is kept as
/// is; the only expected difference is the closing curly quote becoming `"`.
#[test]
fn test_synthetic_negative_brontës_name_does_not_end_with_a_korean_syllable() {
    let fixed = fix_text("I'm not such a fan of Charlotte Brontë…”", None);
    assert_eq!(fixed, "I'm not such a fan of Charlotte Brontë…\"");
}
1644
/// Negative case: `Å` followed by a trademark sign must come back from
/// `fix_text` unchanged.
#[test]
fn test_synthetic_negative_hypothetical_swedish_product_name() {
    let text = "AHÅ™, the new sofa from IKEA";
    assert_eq!(fix_text(text, None), text);
}
1652
/// Negative case: a word in Cyrillic capitals inside an English sentence must
/// come back from `fix_text` unchanged.
#[test]
fn test_synthetic_negative_ukrainian_capital_letters() {
    let text = "ВІКІ is Ukrainian for WIKI";
    assert_eq!(fix_text(text, None), text);
}
1660
/// Negative case for encoding repair: a U+001A in the input is dropped, while
/// the C1 control U+0081 at the end is kept as is.
///
/// Dropping U+001A leaves the two spaces that surrounded it side by side, so
/// the expected text has a double space between "characters" and "are". The
/// second space is spelled `\u{0020}` so it cannot be collapsed by whitespace
/// clean-up in an editor or formatter.
#[test]
fn test_synthetic_negative_dont_leak_our_internal_use_of_byte_0x1a() {
    let fixed = fix_text(
        "These control characters \u{001a} are apparently intentional \u{0081}",
        None,
    );
    assert_eq!(
        fixed,
        "These control characters \u{0020}are apparently intentional \u{0081}"
    );
}
1668
/// A lone U+001A control character at the end of the text is expected to be
/// removed, leaving the rest (including the trailing space) untouched.
///
/// The control character is spelled `\u{001a}` because it is invisible as a
/// raw character; without it the input equals the expected output and the
/// test checks nothing.
#[test]
fn test_synthetic_negative_u1a_on_its_own() {
    let fixed = fix_text("Here's a control character: \u{001a}", None);
    assert_eq!(fixed, "Here's a control character: ");
}
1676
/// Negative case: `Å` followed by a dash must come back from `fix_text`
/// unchanged.
#[test]
fn test_synthetic_negative_a_with_circle_as_an_angstrom_sign() {
    let text = "a radius of 10 Å—";
    assert_eq!(fix_text(text, None), text);
}
1684
/// Negative case: `É` followed by an inverted exclamation mark must come back
/// from `fix_text` unchanged.
#[test]
fn test_synthetic_negative_spanish_with_exclamation_points_on_the_wrong_sides() {
    let text = "!YO SÉ¡";
    assert_eq!(fix_text(text, None), text);
}
1692
/// `≥` (UTF-8 bytes E2 89 A5) misread as Latin-1 appears as `â`, the C1
/// control U+0089, and `¥`; the three must be folded back into `≥`.
///
/// The middle character is spelled `\u{0089}` because it is invisible as a
/// raw character and easily lost.
/// NOTE(review): the test name mentions backslashes, but neither literal
/// contains one — confirm against the upstream test data.
#[test]
fn test_synthetic_fix_text_with_backslashes_in_it() {
    let fixed = fix_text("<40% vs â\u{0089}¥40%", None);
    assert_eq!(fixed, "<40% vs ≥40%");
}
1700
/// Curly quotes damaged in two different ways in one string: the opening
/// quote is UTF-8 (E2 80 9C) misread as Latin-1, while the ellipsis and the
/// closing quote are the bare C1 controls U+0085 and U+0094. The whole thing
/// is expected to come out as a straight-quoted phrase with a real ellipsis.
///
/// Every control character is written as a `\u{…}` escape: they are invisible
/// as raw characters, and U+0085 is treated as a line break by many tools,
/// which splits the literal across two lines.
#[test]
fn test_synthetic_curly_quotes_with_mismatched_encoding_glitches_in_latin1() {
    let fixed = fix_text(
        "â\u{0080}\u{009c}mismatched quotes\u{0085}\u{0094}",
        None,
    );
    assert_eq!(fixed, "\"mismatched quotes…\"");
}
1709
/// Checks a curly-quoted phrase against its expected output; as written, the
/// only difference is the curly quotes becoming straight ones.
///
/// NOTE(review): the input shows no mojibake beyond the curly quotes —
/// confirm the Windows-1252 encoding glitches named by this test were not
/// normalised away.
#[test]
fn test_synthetic_curly_quotes_with_mismatched_encoding_glitches_in_windows_1252() {
    let fixed = fix_text("“mismatched quotes…”", None);
    assert_eq!(fixed, "\"mismatched quotes…\"");
}
1717
/// Checks a phrase that ends in a U+FFFD replacement character: the opening
/// curly quote becomes `"` and the replacement character is kept.
///
/// NOTE(review): the input shows no mojibake beyond the curly quote — confirm
/// the lossy Windows-1252 input named by this test was not normalised away.
#[test]
fn test_synthetic_lossy_decoding_in_sloppy_windows_1252() {
    let fixed = fix_text("“lossy decoding�", None);
    assert_eq!(fixed, "\"lossy decoding�");
}
1725
/// Checks the French word `août` against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the Windows-1252 mojibake
/// input named by this test was not normalised away.
#[test]
fn test_synthetic_french_word_for_august_in_windows_1252() {
    let fixed = fix_text("août", None);
    assert_eq!(fixed, "août");
}
1733
/// Checks the all-caps French word `HÔTEL` against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the Windows-1252 mojibake
/// input named by this test was not normalised away.
#[test]
fn test_synthetic_french_word_for_hotel_in_all_caps_windows_1252() {
    let fixed = fix_text("HÔTEL", None);
    assert_eq!(fixed, "HÔTEL");
}
1741
/// Checks the all-caps Scottish Gaelic word `CÙIS` against its expected
/// output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the Windows-1252 mojibake
/// input named by this test was not normalised away.
#[test]
fn test_synthetic_scottish_gaelic_word_for_subject_in_all_caps_windows_1252() {
    let fixed = fix_text("CÙIS", None);
    assert_eq!(fixed, "CÙIS");
}
1749
/// Negative case: a word ending in `Ă` followed by a non-breaking space must
/// come back from `fix_text` unchanged.
///
/// The trailing no-break space is spelled `\u{00a0}`: as a raw trailing
/// character it is indistinguishable from an ordinary space and is easily
/// replaced or stripped.
#[test]
fn test_synthetic_negative_romanian_word_before_a_non_breaking_space() {
    let text = "NICIODATĂ\u{00a0}";
    assert_eq!(fix_text(text, None), text);
}
1757
/// Negative case for encoding repair: a literal `Ã` followed by a curly
/// apostrophe stays `Ã`; the only expected difference is the apostrophe
/// becoming `'`.
#[test]
fn test_synthetic_negative_be_careful_around_curly_apostrophes() {
    let fixed = fix_text("There are a lot of Ã’s in mojibake text", None);
    assert_eq!(fixed, "There are a lot of Ã's in mojibake text");
}
1765
/// Negative case: a word ending in `Ă` followed by a trademark sign must come
/// back from `fix_text` unchanged.
#[test]
fn test_synthetic_negative_romanian_word_before_a_trademark_sign() {
    let text = "NICIODATĂ™";
    assert_eq!(fix_text(text, None), text);
}
1773
/// Negative case: a Cyrillic word with a capital `Ђ` in the middle must come
/// back from `fix_text` unchanged.
#[test]
fn test_synthetic_negative_camel_cased_serbian_that_looks_like_a_utf8_windows_1251_mixup() {
    let text = "ПоздравЂаво";
    assert_eq!(fix_text(text, None), text);
}
1781
/// Checks the French phrase `OÙ ET QUAND?` against its expected output.
///
/// NOTE(review): the input literal is identical to the expected output here,
/// so no repair is actually exercised — confirm the mojibake input (with a
/// trademark sign) named by this test was not normalised away.
#[test]
fn test_synthetic_mojibake_with_trademark_sign_at_the_end_of_a_word() {
    let fixed = fix_text("OÙ ET QUAND?", None);
    assert_eq!(fixed, "OÙ ET QUAND?");
}
1789}