plsfix/
lib.rs

1#[macro_use]
2extern crate lazy_static;
3
4mod badness;
5mod chardata;
6mod fixes;
7
8mod codecs;
9
10use std::borrow::Cow;
11use std::cmp::min;
12
13use badness::is_bad;
14use chardata::possible_encoding;
15use chardata::ALTERED_UTF8_RE;
16use chardata::CHARMAP_ENCODINGS;
17use codecs::sloppy;
18use codecs::sloppy::Codec;
19use codecs::sloppy::LATIN_1;
20use codecs::sloppy::WINDOWS_1252;
21use codecs::utf8_variants;
22use fixes::decode_inconsistent_utf8;
23use fixes::fix_c1_controls;
24use fixes::fix_character_width;
25use fixes::fix_latin_ligatures;
26use fixes::fix_line_breaks;
27use fixes::remove_control_chars;
28use fixes::remove_terminal_escapes;
29use fixes::replace_lossy_sequences;
30use fixes::restore_byte_a0;
31use fixes::uncurl_quotes;
32use fixes::unescape_html;
33use icu::normalizer::ComposingNormalizer;
34use icu::normalizer::DecomposingNormalizer;
35
36use crate::codecs::sloppy::CodecType;
37
/// Unicode normalization form that can be applied as the final fixing step.
///
/// The variants correspond to the four standard Unicode normalization forms.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Normalization {
    /// Canonical decomposition followed by canonical composition.
    NFC,
    /// Compatibility decomposition followed by canonical composition.
    NFKC,
    /// Canonical decomposition.
    NFD,
    /// Compatibility decomposition.
    NFKD,
}
45
/// Upper bound on the number of fixing passes made over a piece of text
/// before giving up on reaching a point where nothing changes any more.
const MAX_ATTEMPTS: i32 = 16;
47
48/*
49A TextFixerConfig object stores configuration options for plsfix.
50
51It's implemented as a namedtuple with defaults, so you can instantiate
52it by providing the values to change from their defaults as keyword arguments.
53For example, to disable 'unescape_html' and keep the rest of the defaults::
54
55    TextFixerConfig(unescape_html=False)
56
57Here are the options and their default values:
58
59- `unescape_html`: "auto"
60
61    Configures whether to replace HTML entities such as &amp; with the character
62    they represent. "auto" says to do this by default, but disable it when a
63    literal < character appears, indicating that the input is actual HTML and
64    entities should be preserved. The value can be True, to always enable this
65    fixer, or False, to always disable it.
66
67- `remove_terminal_escapes`: True
68
69    Removes "ANSI" terminal escapes, such as for changing the color of text in a
70    terminal window.
71
72- `fix_encoding`: True
73
74    Detect mojibake and attempt to fix it by decoding the text in a different
75    encoding standard.
76
77    The following four options affect `fix_encoding` works, and do nothing if
78    `fix_encoding` is False:
79
80    - `restore_byte_a0`: True
81
82    Allow a literal space (U+20) to be interpreted as a non-breaking space
83    (U+A0) when that would make it part of a fixable mojibake string.
84
85    Because spaces are very common characters, this could lead to false
86    positives, but we try to apply it only when there's strong evidence for
87    mojibake. Disabling `restore_byte_a0` is safer from false positives,
88    but creates false negatives.
89
90    - `replace_lossy_sequences`: True
91
92    Detect mojibake that has been partially replaced by the characters
93    '�' or '?'. If the mojibake could be decoded otherwise, replace the
94    detected sequence with '�'.
95
96    - `decode_inconsistent_utf8`: True
97
98    When we see sequences that distinctly look like UTF-8 mojibake, but
99    there's no consistent way to reinterpret the string in a new encoding,
100    replace the mojibake with the appropriate UTF-8 characters anyway.
101
102    This helps to decode strings that are concatenated from different
103    encodings.
104
105    - `fix_c1_controls`: True
106
107    Replace C1 control characters (the useless characters U+80 - U+9B that
108    come from Latin-1) with their Windows-1252 equivalents, like HTML5 does,
109    even if the whole string doesn't decode as Latin-1.
110
111- `fix_latin_ligatures`: True
112
113    Replace common Latin-alphabet ligatures, such as ``ﬁ``, with the
114    letters they're made of.
115
116- `fix_character_width`: True
117
118    Replace fullwidth Latin characters and halfwidth Katakana with
119    their more standard widths.
120
121- `uncurl_quotes`: True
122
123    Replace curly quotes with straight quotes.
124
125- `fix_line_breaks`: True
126
127    Replace various forms of line breaks with the standard Unix line
128    break, ``\n``.
129
130- `fix_surrogates`: True
131
132    Replace sequences of UTF-16 surrogate codepoints with the character
133    they were meant to encode. This fixes text that was decoded with the
134    obsolete UCS-2 standard, and allows it to support high-numbered
135    codepoints such as emoji.
136
137- `remove_control_chars`: True
138
139    Remove certain control characters that have no displayed effect on text.
140
141- `normalization`: "NFC"
142
143    Choose what kind of Unicode normalization is applied. Usually, we apply
144    NFC normalization, so that letters followed by combining characters become
145    single combined characters.
146
147    Changing this to "NFKC" applies more compatibility conversions, such as
148    replacing the 'micro sign' with a standard Greek lowercase mu, which looks
149    identical. However, some NFKC normalizations change the meaning of text,
150    such as converting "10³" to "103".
151
152`normalization` can be None, to apply no normalization.
153
154- `max_decode_length`: 1_000_000
155
156    The maximum size of "segment" that plsfix will try to fix all at once.
157
158- `explain`: True
159
160    Whether to compute 'explanations', lists describing what plsfix changed.
161    When this is False, the explanation will be None, and the code that
162    builds the explanation will be skipped, possibly saving time.
163
164    Functions that accept TextFixerConfig and don't return an explanation
165    will automatically set `explain` to False.
166*/
167#[derive(Debug, Clone, Copy)]
168pub struct TextFixerConfig {
169    pub unescape_html: Option<bool>,
170    pub remove_terminal_escapes: bool,
171    pub fix_encoding: bool,
172    pub restore_byte_a0: bool,
173    pub replace_lossy_sequences: bool,
174    pub decode_inconsistent_utf8: bool,
175    pub fix_c1_controls: bool,
176    pub fix_latin_ligatures: bool,
177    pub fix_character_width: bool,
178    pub uncurl_quotes: bool,
179    pub fix_line_breaks: bool,
180    pub remove_control_chars: bool,
181    pub normalization: Option<Normalization>,
182    pub max_decode_length: i32,
183}
184
185impl Default for TextFixerConfig {
186    fn default() -> Self {
187        Self {
188            unescape_html: None,
189            remove_terminal_escapes: true,
190            fix_encoding: true,
191            restore_byte_a0: true,
192            replace_lossy_sequences: true,
193            decode_inconsistent_utf8: true,
194            fix_c1_controls: true,
195            fix_latin_ligatures: true,
196            fix_character_width: true,
197            uncurl_quotes: true,
198            fix_line_breaks: true,
199            remove_control_chars: true,
200            normalization: Some(Normalization::NFC),
201            max_decode_length: 1_000_000,
202        }
203    }
204}
205
206pub fn fix_text(text: &str, config: Option<&TextFixerConfig>) -> String {
207    /*
208    Given Unicode text as input, fix inconsistencies and glitches in it,
209    such as mojibake (text that was decoded in the wrong encoding).
210
211    plsfix applies a number of different fixes to the text, and can accept
212    configuration to select which fixes to apply.
213
214    For convenience and backward compatibility, the configuration can also
215    take the form of keyword arguments, which will set the equivalently-named
216    fields of the TextFixerConfig object.
217
218    For example, here are two ways to fix text but skip the "uncurl_quotes"
219    step::
220
221        fix_text(text, TextFixerConfig(uncurl_quotes=False))
222        fix_text(text, uncurl_quotes=False)
223
224    This function fixes text in independent segments, which are usually lines
225    of text, or arbitrarily broken up every 1 million codepoints (configurable
226    with `config.max_decode_length`) if there aren't enough line breaks. The
227    bound on segment lengths helps to avoid unbounded slowdowns.
228
229    plsfix can also provide an 'explanation', a list of transformations it applied
230    to the text that would fix more text like it. This function doesn't provide
231    explanations (because there may be different fixes for different segments
232    of text).
233
234    To get an explanation, use the :func:`fix_and_explain()` function, which
235    fixes the string in one segment and explains what it fixed.
236     */
237    // let default_config = TextFixerConfig::default();
238    // let mut config = config.unwrap_or(&default_config).clone();
239
240    let mut config: TextFixerConfig = match config {
241        Some(config) => config.clone(),
242        None => TextFixerConfig::default(),
243    };
244
245    let mut out: Vec<String> = Vec::new();
246
247    let mut pos = 0;
248
249    while pos < text.len() {
250        let mut textbreak = match text[pos..].find("\n") {
251            Some(idx) => pos + idx + 1,
252            None => text.len(),
253        };
254
255        if (textbreak - pos) > config.max_decode_length as usize {
256            textbreak = min(pos + config.max_decode_length as usize, text.len());
257        }
258
259        let segment = &text[pos..textbreak];
260
261        if config.unescape_html.is_none() {
262            if segment.contains("<") {
263                config.unescape_html = Some(false);
264            }
265        }
266
267        let res = fix_and_explain(segment, false, Some(&config));
268        out.push(res.text);
269
270        pos = textbreak;
271    }
272
273    out.join("")
274}
275
/// A step in an `ExplainedText`, describing one transformation that was
/// applied while fixing text.
///
/// `transformation` is a short description of the step. It is either the name
/// of the fixer that changed the text (for example `"fix_c1_controls"` or
/// `"normalize"`), or an encoding operation written as `"encode <encoding>"`
/// or `"decode <encoding>"`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExplanationStep {
    /// Description of the transformation that was applied.
    pub transformation: String,
}

/// The result of fixing text: the fixed text and, optionally, the list of
/// steps that were taken to produce it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExplainedText {
    /// The fixed text.
    pub text: String,
    /// The steps that were applied, in order, or `None` when no explanation
    /// was collected.
    pub steps: Option<Vec<ExplanationStep>>,
}
297
298fn apply_step<'a, F>(
299    f: F,
300    text: &'a str,
301    step: ExplanationStep,
302    steps: &mut Option<Vec<ExplanationStep>>,
303) -> Cow<'a, str>
304where
305    F: Fn(&'a str) -> Cow<'a, str>,
306{
307    let res = f(text);
308    if res != text {
309        if let Some(s) = steps {
310            s.push(step);
311        }
312    }
313    res
314}
315
316pub fn fix_and_explain(
317    text: &str,
318    explain: bool,
319    config: Option<&TextFixerConfig>,
320) -> ExplainedText {
321    /*
322    Fix text as a single segment, returning the fixed text and an explanation
323    of what was fixed.
324
325    The explanation is a list of steps that can be applied with
326    :func:`apply_plan`, or if config.explain is False, it will be None.
327    */
328    let mut text = text.to_string();
329    let config = match config {
330        Some(config) => config.clone(),
331        None => TextFixerConfig::default(),
332    };
333
334    let mut steps: Option<Vec<ExplanationStep>> = if explain { Some(Vec::new()) } else { None };
335
336    for _ in 0..MAX_ATTEMPTS {
337        let temp = unescape_html(&text);
338
339        let temp = if config.fix_encoding {
340            let encoding_fixed = fix_encoding_and_explain(&temp, explain, Some(&config));
341            if let Some(s) = &mut steps {
342                s.extend(encoding_fixed.steps.unwrap_or(Vec::new()));
343            }
344            encoding_fixed.text.into()
345        } else {
346            temp
347        };
348
349        let temp = if config.fix_c1_controls {
350            apply_step(
351                fix_c1_controls,
352                &temp,
353                ExplanationStep {
354                    transformation: String::from("fix_c1_controls"),
355                },
356                &mut steps,
357            )
358        } else {
359            temp
360        };
361
362        let temp = if config.fix_latin_ligatures {
363            apply_step(
364                fix_latin_ligatures,
365                &temp,
366                ExplanationStep {
367                    transformation: String::from("fix_latin_ligatures"),
368                },
369                &mut steps,
370            )
371        } else {
372            temp
373        };
374
375        let temp = if config.fix_character_width {
376            apply_step(
377                fix_character_width,
378                &temp,
379                ExplanationStep {
380                    transformation: String::from("fix_character_width"),
381                },
382                &mut steps,
383            )
384        } else {
385            temp
386        };
387
388        let temp = if config.uncurl_quotes {
389            apply_step(
390                uncurl_quotes,
391                &temp,
392                ExplanationStep {
393                    transformation: String::from("uncurl_quotes"),
394                },
395                &mut steps,
396            )
397        } else {
398            temp
399        };
400
401        let temp = if config.fix_line_breaks {
402            apply_step(
403                fix_line_breaks,
404                &temp,
405                ExplanationStep {
406                    transformation: String::from("fix_line_breaks"),
407                },
408                &mut steps,
409            )
410        } else {
411            temp
412        };
413
414        let temp = if config.remove_terminal_escapes {
415            apply_step(
416                remove_terminal_escapes,
417                &temp,
418                ExplanationStep {
419                    transformation: String::from("remove_terminal_escapes"),
420                },
421                &mut steps,
422            )
423        } else {
424            temp
425        };
426
427        let temp = if config.remove_control_chars {
428            apply_step(
429                remove_control_chars,
430                &temp,
431                ExplanationStep {
432                    transformation: String::from("remove_control_chars"),
433                },
434                &mut steps,
435            )
436        } else {
437            temp
438        };
439
440        let temp = if let Some(normalization) = &config.normalization {
441            apply_step(
442                |t| match normalization {
443                    Normalization::NFC => ComposingNormalizer::new_nfc().normalize(t).into(),
444                    Normalization::NFD => DecomposingNormalizer::new_nfd().normalize(t).into(),
445                    Normalization::NFKD => DecomposingNormalizer::new_nfkd().normalize(t).into(),
446                    Normalization::NFKC => ComposingNormalizer::new_nfkc().normalize(t).into(),
447                },
448                &temp,
449                ExplanationStep {
450                    transformation: String::from("normalize"),
451                },
452                &mut steps,
453            )
454        } else {
455            temp
456        };
457
458        if temp == text {
459            return ExplainedText {
460                text: text.into(),
461                steps,
462            };
463        }
464
465        text = temp.into();
466    }
467
468    ExplainedText { text, steps }
469}
470
471fn fix_encoding_and_explain(
472    text: &str,
473    explain: bool,
474    config: Option<&TextFixerConfig>,
475) -> ExplainedText {
476    /*
477    Apply the steps of plsfix that detect mojibake and fix it. Returns the fixed
478    text and a list explaining what was fixed.
479
480    This includes fixing text by encoding and decoding it in different encodings,
481    as well as the subordinate fixes `restore_byte_a0`, `replace_lossy_sequences`,
482    `decode_inconsistent_utf8`, and `fix_c1_controls`.
483    */
484    let config = match config {
485        Some(config) => config.clone(),
486        None => TextFixerConfig::default(),
487    };
488
489    let mut prev_text = text.to_string();
490
491    let plan_so_far = if explain { Some(Vec::new()) } else { None };
492
493    for _ in 0..MAX_ATTEMPTS {
494        let new_text = _fix_encoding_one_step_and_explain(&prev_text, explain, &config);
495
496        if new_text.text == prev_text {
497            if let Some(mut plan) = plan_so_far {
498                plan.extend(new_text.steps.unwrap_or(Vec::new()));
499
500                return ExplainedText {
501                    text: new_text.text,
502                    steps: Some(plan),
503                };
504            }
505
506            return ExplainedText {
507                text: new_text.text,
508                steps: None,
509            };
510        }
511
512        prev_text = new_text.text;
513    }
514
515    ExplainedText {
516        text: prev_text,
517        steps: None,
518    }
519}
520
521fn _fix_encoding_one_step_and_explain(
522    text: &str,
523    explain: bool,
524    config: &TextFixerConfig,
525) -> ExplainedText {
526    let mut text = text.to_string();
527
528    if text.len() == 0 {
529        // return text;
530        return ExplainedText { text, steps: None };
531    }
532
533    // The first plan is to return ASCII text unchanged, as well as text
534    // that doesn't look like it contains mojibake
535    if possible_encoding(&text, sloppy::CodecType::Ascii) || !is_bad(&text) {
536        return ExplainedText { text, steps: None };
537    }
538
539    // As we go through the next step, remember the possible encodings
540    // that we encounter but don't successfully fix yet. We may need them
541    // later.
542    let mut possible_1byte_encodings = vec![];
543
544    // Suppose the text was supposed to be UTF-8, but it was decoded using
545    // a single-byte encoding instead. When these cases can be fixed, they
546    // are usually the correct thing to do, so try them next.
547    for (codec_type, encoding) in CHARMAP_ENCODINGS.iter() {
548        if possible_encoding(&text, encoding.codec_type()) {
549            possible_1byte_encodings.push(codec_type);
550            let encoded_bytes = encoding.encode(&text);
551
552            // Now, find out if it's UTF-8 (or close enough). Otherwise,
553            // remember the encoding for later.
554            if let Ok(mut encoded_bytes) = encoded_bytes {
555                let mut decoding = CodecType::Utf8;
556                let mut transcode_steps = if explain { Some(Vec::new()) } else { None };
557
558                // Check encoded_bytes for sequences that would be UTF-8,
559                // except they have b' ' where b'\xa0' would belong.
560                if config.restore_byte_a0 && ALTERED_UTF8_RE.is_match(&encoded_bytes) {
561                    let replaced_bytes = restore_byte_a0(&encoded_bytes);
562
563                    if replaced_bytes != encoded_bytes {
564                        if let Some(s) = &mut transcode_steps {
565                            s.push(ExplanationStep {
566                                transformation: String::from("restore_byte_a0"),
567                            });
568                        }
569                        encoded_bytes = replaced_bytes;
570                    }
571                }
572
573                // Replace sequences where information has been lost
574                if config.replace_lossy_sequences && encoding.name().starts_with("sloppy") {
575                    let replaced_bytes = replace_lossy_sequences(&encoded_bytes);
576
577                    if replaced_bytes != encoded_bytes {
578                        if let Some(s) = &mut transcode_steps {
579                            s.push(ExplanationStep {
580                                transformation: String::from("replace_lossy_sequences"),
581                            });
582                        }
583                        encoded_bytes = replaced_bytes;
584                    }
585                }
586
587                if encoded_bytes.contains(&0xED) || encoded_bytes.contains(&0xC0) {
588                    decoding = CodecType::Utf8Variant;
589                }
590
591                let steps = if explain {
592                    Some(vec![
593                        ExplanationStep {
594                            transformation: format!("decode {:?}", decoding),
595                        },
596                        ExplanationStep {
597                            transformation: format!("encode {}", encoding.name()),
598                        },
599                    ])
600                } else {
601                    None
602                };
603
604                if decoding == CodecType::Utf8 {
605                    let fixed = std::str::from_utf8(&encoded_bytes);
606
607                    if let Ok(s) = fixed {
608                        return ExplainedText {
609                            text: s.to_string(),
610                            steps,
611                        };
612                    } else {
613                        continue;
614                    }
615                } else if decoding == CodecType::Utf8Variant {
616                    let fixed = utf8_variants::variant_decode(&encoded_bytes);
617
618                    if let Ok(s) = fixed {
619                        return ExplainedText {
620                            text: s.to_string(),
621                            steps,
622                        };
623                    } else {
624                        continue;
625                    }
626                }
627            }
628        }
629    }
630
631    // Look for a-hat-euro sequences that remain, and fix them in isolation.
632    if config.decode_inconsistent_utf8 {
633        let fixed = decode_inconsistent_utf8(&text);
634        if fixed != text {
635            text = fixed.into();
636        }
637    }
638
639    // The next most likely case is that this is Latin-1 that was intended to
640    // be read as Windows-1252, because those two encodings in particular are
641    // easily confused.
642    if possible_1byte_encodings.contains(&&sloppy::CodecType::Latin1) {
643        if possible_1byte_encodings.contains(&&sloppy::CodecType::SloppyWindows1252) {
644            // This text is in the intersection of Latin-1 and
645            // Windows-1252, so it's probably legit.
646            return ExplainedText { text, steps: None };
647        } else {
648            // Otherwise, it means we have characters that are in Latin-1 but
649            // not in Windows-1252. Those are C1 control characters. Nobody
650            // wants those. Assume they were meant to be Windows-1252.
651            let encoded = LATIN_1.encode(&text);
652            if let Ok(encoded) = encoded {
653                let fixed = WINDOWS_1252.decode(&encoded);
654                if fixed != text {
655                    let steps = if explain {
656                        Some(vec![
657                            ExplanationStep {
658                                transformation: String::from("encode latin-1"),
659                            },
660                            ExplanationStep {
661                                transformation: String::from("decode windows-1252"),
662                            },
663                        ])
664                    } else {
665                        None
666                    };
667
668                    return ExplainedText { text: fixed, steps };
669                }
670            }
671        }
672    }
673
674    // Fix individual characters of Latin-1 with a less satisfying explanation
675    if config.fix_c1_controls {
676        let fixed = fix_c1_controls(&text);
677        let steps = if explain {
678            Some(vec![ExplanationStep {
679                transformation: String::from("fix_c1_controls"),
680            }])
681        } else {
682            None
683        };
684        return ExplainedText {
685            text: fixed.into(),
686            steps,
687        };
688    }
689
690    // The cases that remain are mixups between two different single-byte
691    // encodings, and not the common case of Latin-1 vs. Windows-1252.
692    //
693    // With the new heuristic in 6.0, it's possible that we're closer to solving
694    // these in some cases. It would require a lot of testing and tuning, though.
695    // For now, we leave the text unchanged in these cases.
696    ExplainedText { text, steps: None }
697}
698
699#[cfg(test)]
700mod tests {
701    use super::fix_text;
702    use pretty_assertions::assert_eq;
703
704    #[test]
705    fn test_messy_language_names_czech() {
706        let original = "ÄŒeÅ¡tina";
707        let expected = "Čeština";
708        let result = fix_text(original, None);
709        assert_eq!(result, expected);
710    }
711
712    #[test]
713    fn test_messy_language_names_gaelic() {
714        let original = "GÃ idhlig";
715        let expected = "Gàidhlig";
716        let result = fix_text(original, None);
717        assert_eq!(result, expected);
718    }
719
720    #[test]
721    fn test_messy_language_names_lithuanian() {
722        let original = "LietuviÅ³";
723        let expected = "Lietuvių";
724        let result = fix_text(original, None);
725        assert_eq!(result, expected);
726    }
727
728    #[test]
729    fn test_messy_language_names_slovak() {
730        let original = "SlovenÄ�ina";
731        let expected = "Sloven�ina";
732        let result = fix_text(original, None);
733        assert_eq!(result, expected);
734    }
735
736    #[test]
737    fn test_messy_language_names_vietnamese() {
738        let original = "Tiáº¿ng Viá»‡t";
739        let expected = "Tiếng Việt";
740        let result = fix_text(original, None);
741        assert_eq!(result, expected);
742    }
743
744    #[test]
745    fn test_messy_language_names_greek() {
746        let original = "Î•Î»Î»Î·Î½Î¹ÎºÎ¬";
747        let expected = "Ελληνικά";
748        let result = fix_text(original, None);
749        assert_eq!(result, expected);
750    }
751
752    #[test]
753    fn test_messy_language_names_bulgarian() {
754        let original = "Ð±ÑŠÐ»Ð³Ð°Ñ€Ñ�ÐºÐ¸ ÐµÐ·Ð¸Ðº";
755        let expected = "българ�ки език";
756        let result = fix_text(original, None);
757        assert_eq!(result, expected);
758    }
759
760    #[test]
761    fn test_messy_language_names_russian() {
762        let original = "Ð ÑƒÑ�Ñ�ÐºÐ¸Ð¹";
763        let expected = "Ру��кий";
764        let result = fix_text(original, None);
765        assert_eq!(result, expected);
766    }
767
768    #[test]
769    fn test_messy_language_names_serbian_cyrillic() {
770        let original = "CÑ€Ð¿Ñ�ÐºÐ¸ [Ñ›Ð¸Ñ€Ð¸Ð»Ð¸Ñ†Ð¾Ð¼]";
771        let expected = "Cрп�ки [ћирилицом]";
772        let result = fix_text(original, None);
773        assert_eq!(result, expected);
774    }
775
776    #[test]
777    fn test_messy_language_names_hebrew() {
778        let original = "×¢×‘×¨×™×ª";
779        let expected = "עברית";
780        let result = fix_text(original, None);
781        assert_eq!(result, expected);
782    }
783
784    #[test]
785    fn test_messy_language_names_russian_2() {
786        let original = "Ð ÑƒÑ�Ñ�ÐºÐ¸Ð¹";
787        let expected = "Ру��кий";
788        let result = fix_text(original, None);
789        assert_eq!(result, expected);
790    }
791
792    #[test]
793    fn test_messy_language_names_hindi() {
794        let original = "à¤¹à¤¿à¤¨à¥�à¤¦à¥€";
795        let expected = "हिन�दी";
796        let result = fix_text(original, None);
797        assert_eq!(result, expected);
798    }
799
800    #[test]
801    fn test_messy_language_names_tamil() {
802        let original = "à®¤à®®à®¿à®´à¯�";
803        let expected = "தமிழ�";
804        let result = fix_text(original, None);
805        assert_eq!(result, expected);
806    }
807
808    #[test]
809    fn test_messy_language_names_thai() {
810        let original = "à¸ à¸²à¸©à¸²à¹„à¸—à¸¢";
811        let expected = "ภาษาไทย";
812        let result = fix_text(original, None);
813        assert_eq!(result, expected);
814    }
815
816    #[test]
817    fn test_messy_language_names_simplified_chinese() {
818        let original = "ç®€ä½“ä¸æ–‡";
819        let expected = "简体中文";
820        let result = fix_text(original, None);
821        assert_eq!(result, expected);
822    }
823
824    #[test]
825    fn test_messy_language_names_traditional_chinese() {
826        let original = "æ£é«”ä¸æ–‡";
827        let expected = "正體中文";
828        let result = fix_text(original, None);
829        assert_eq!(result, expected);
830    }
831
832    #[test]
833    fn test_messy_language_names_japanese() {
834        let original = "æ—¥æœ¬èªž";
835        let expected = "日本語";
836        let result = fix_text(original, None);
837        assert_eq!(result, expected);
838    }
839
840    #[test]
841    fn test_messy_language_names_korean() {
842        let original = "í•œêµì–´";
843        let expected = "한국어";
844        let result = fix_text(original, None);
845        assert_eq!(result, expected);
846    }
847
848    #[test]
849    fn test_low_codepoint_emoji() {
850        let original = "He's Justinâ¤";
851        let expected = "He's Justin❤";
852        let result = fix_text(original, None);
853        assert_eq!(result, expected);
854    }
855
856    #[test]
857    fn test_utf8_macroman_mix_up_about_smurfs() {
858        let original = "Le Schtroumpf Docteur conseille g√¢teaux et baies schtroumpfantes pour un r√©gime √©quilibr√©.";
859        let expected = "Le Schtroumpf Docteur conseille gâteaux et baies schtroumpfantes pour un régime équilibré.";
860        let result = fix_text(original, None);
861        assert_eq!(result, expected);
862    }
863
864    #[test]
865    fn test_checkmark_that_almost_looks_okay_as_mojibake() {
866        let original = "âœ” No problems";
867        let expected = "✔ No problems";
868        let result = fix_text(original, None);
869        assert_eq!(result, expected);
870    }
871
872    #[test]
873    fn test_utf8_windows_1251_russian_mixup_about_futbol() {
874        let original = "РґРѕСЂРѕРіРµ РР·-РїРѕРґ #С„СѓС‚Р±РѕР»";
875        let expected = "дороге Из-под #футбол";
876        let result = fix_text(original, None);
877        assert_eq!(result, expected);
878    }
879
880    #[test]
881    fn test_latin1_windows_1252_mixup_in_german() {
882        let original = "Handwerk bringt dich überall hin: Von der YOU bis nach Monaco";
883        let expected = "\"Handwerk bringt dich überall hin\": Von der YOU bis nach Monaco";
884        let result = fix_text(original, None);
885        assert_eq!(result, expected);
886    }
887
888    #[test]
889    fn test_latin1_windows_1252_mixup_of_the_replacement_character() {
890        let original = "Some comments may be republished on the website or in the newspaper ï¿½ email addresses will not be published.";
891        let expected = "Some comments may be republished on the website or in the newspaper � email addresses will not be published.";
892        let result = fix_text(original, None);
893        assert_eq!(result, expected);
894    }
895
896    #[test]
897    fn test_cesu8_windows_1252_emoji() {
898        let original = "Hi guys í ½í¸";
899        let expected = "Hi guys 😍";
900        let result = fix_text(original, None);
901        assert_eq!(result, expected);
902    }
903
904    #[test]
905    fn test_cesu8_latin1_emoji() {
906        let original = "hihi RT username: âºí ½í¸";
907        let expected = "hihi RT username: ☺😘";
908        let result = fix_text(original, None);
909        assert_eq!(result, expected);
910    }
911
// Repair case: the garbled Turkish headline must regain its dotless i and
// u-umlaut letters.
#[test]
fn test_latin1_windows_1252_mixup_in_turkish() {
    let garbled = "Beta Haber: HÄ±rsÄ±zÄ± BÃ¼yÃ¼ Korkuttu";
    let wanted = "Beta Haber: Hırsızı Büyü Korkuttu";
    assert_eq!(fix_text(garbled, None), wanted);
}
919
// Repair case: the heavily garbled literal must come back as a Russian word
// followed by a heart symbol.
#[test]
fn test_utf8_windows_1251_mixed_up_twice_in_russian() {
    let garbled = "Р С—РЎР‚Р С‘РЎРЏРЎвЂљР Р…Р С•РЎРѓРЎвЂљР С‘. РІСњВ¤";
    let wanted = "приятности. ❤";
    assert_eq!(fix_text(garbled, None), wanted);
}
927
// Repair case: the two garbled runs around "Romance" must end up as straight
// double quotes in the output.
#[test]
fn test_utf8_windows_1252_mixed_up_twice_in_malay() {
    let garbled = "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New Ã¢â‚¬Å“ RomanceÃ¢â‚¬Â.";
    let wanted = "Kayanya laptopku error deh, soalnya tiap mau ngetik deket-deket kamu font yg keluar selalu Times New \" Romance\".";
    assert_eq!(fix_text(garbled, None), wanted);
}
935
// Repair case: the four-character garble inside the parentheses must become
// a single "é".
#[test]
fn test_utf8_windows_1252_mixed_up_twice_in_naming_iggy_pop() {
    let garbled = "Iggy Pop (nÃƒÂ© Jim Osterberg)";
    let wanted = "Iggy Pop (né Jim Osterberg)";
    assert_eq!(fix_text(garbled, None), wanted);
}
943
// Repair case: the expected text has straight double quotes around
// "senza modifiche", where the input shows garbled quoting.
#[test]
fn test_left_quote_is_utf8_right_quote_is_latin1_both_encoded_in_windows_1252() {
    let garbled = "Direzione Pd, ok âsenza modifiche all'Italicum.";
    let wanted = "Direzione Pd, ok \"senza modifiche\" all'Italicum.";
    assert_eq!(fix_text(garbled, None), wanted);
}
951
// Repair case: the garbled run after "sob" must decode to the emoticon in the
// expected literal.
#[test]
fn test_utf8_sloppy_windows_1252_mixed_up_twice_in_a_triumphant_emoticon() {
    let garbled = "selamat berpuasa sob (Ã Â¸â€¡'ÃŒâ‚¬Ã¢Å’Â£'ÃŒÂ)Ã Â¸â€¡";
    let wanted = "selamat berpuasa sob (ง'̀⌣'́)ง";
    assert_eq!(fix_text(garbled, None), wanted);
}
959
// Repair case: the long garble inside "doesn...t" must reduce to a plain
// ASCII apostrophe.
#[test]
fn test_utf8_windows_1252_mixed_up_three_times() {
    let garbled = "The Mona Lisa doesnÃƒÂ¢Ã¢â€šÂ¬Ã¢â€žÂ¢t have eyebrows.";
    let wanted = "The Mona Lisa doesn't have eyebrows.";
    assert_eq!(fix_text(garbled, None), wanted);
}
967
// Repair case: a hashtag rendered as box-drawing characters must be restored
// to Cyrillic text.
#[test]
fn test_utf8_codepag_437_mixup_in_russian() {
    let garbled = "#╨┐╤Ç╨░╨▓╨╕╨╗╤î╨╜╨╛╨╡╨┐╨╕╤é╨░╨╜╨╕╨╡";
    let wanted = "#правильноепитание";
    assert_eq!(fix_text(garbled, None), wanted);
}
975
// Repair case: the two-character garble in the first word must become "ô".
#[test]
fn test_utf8_windows_1252_mixup_in_french() {
    let garbled = "HÃ´tel de Police";
    let wanted = "Hôtel de Police";
    assert_eq!(fix_text(garbled, None), wanted);
}
983
// Repair case: both garbled pairs must be restored to the accented French
// letters "è" and "ô".
#[test]
fn test_utf8_windows_1250_mixup_in_french() {
    let garbled = "LiĂ¨ge Avenue de l'HĂ´pital";
    let wanted = "Liège Avenue de l'Hôpital";
    assert_eq!(fix_text(garbled, None), wanted);
}
991
// Repair case: every garbled run in the sentence must be restored to its
// Vietnamese letter; the trailing "?" stays as is.
#[test]
fn test_utf8_windows_1252_mixup_in_vietnamese() {
    let garbled = "Táº¡i sao giÃ¡ háº¡t sáº§u riÃªng láº¡i lÃªn giÃ¡?";
    let wanted = "Tại sao giá hạt sầu riêng lại lên giá?";
    assert_eq!(fix_text(garbled, None), wanted);
}
999
// Negative case: the expected literal reads the same as the input, so the
// Greek text with diaeresis marks must pass through untouched.
#[test]
fn test_negative_using_diaereses_as_quotation_marks_in_greek() {
    let input = "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές";
    let wanted = "Η ¨ανατροφή¨ δυστυχώς από τους προπονητές";
    assert_eq!(fix_text(input, None), wanted);
}
1007
// Repair case: the garble inside "IL2R..cKO" must become a Greek gamma while
// the rest of the sentence is unchanged.
#[test]
fn test_science_mid_word_greek_letter_gets_fixed_correctly() {
    let garbled = "Humanized HLA-DR4.RagKO.IL2RÎ³cKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.";
    let wanted = "Humanized HLA-DR4.RagKO.IL2RγcKO.NOD (DRAG) mice sustain the complex vertebrate life cycle of Plasmodium falciparum malaria.";
    assert_eq!(fix_text(garbled, None), wanted);
}
1015
// Mostly negative case: the multiplication sign must survive; the only
// difference in the expected text is curly quotes turned into straight ones.
#[test]
fn test_negative_more_science_dont_fix_a_multiplication_symbol_in_quotes() {
    let input = "higher values (“+” and “×” curves) in the superficial region";
    let wanted = "higher values (\"+\" and \"×\" curves) in the superficial region";
    assert_eq!(fix_text(input, None), wanted);
}
1023
// Partial-repair case: the expected output still contains U+FFFD replacement
// characters, so this pins the current best-effort result rather than a
// clean fix.
#[test]
fn test_for_goodness_sake_we_can_come_close_to_fixing_this_but_fail_in_the_last_step() {
    let garbled = "ItÃ?Â¢â?¬â?¢s classic. ItÃ?Â¢â?¬â?¢s epic. ItÃ?Â¢â?¬â?¢s ELIZABETH BENNET for goodnessÃ?Â¢â?¬â?¢ sake!";
    let wanted =
        "It�¢��s classic. It�¢��s epic. It�¢��s ELIZABETH BENNET for goodness�¢�� sake!";
    assert_eq!(fix_text(garbled, None), wanted);
}
1032
// Lossy repair case: recoverable accents are restored, while the damaged
// first letter of "frica" stays a U+FFFD replacement character.
#[test]
fn test_lossy_utf8_windows_1250_mixup_in_spanish() {
    let garbled =
        "Europa, Asia, Ă�frica, Norte, AmĂ©rica Central y del Sur, Australia y OceanĂa";
    let wanted =
        "Europa, Asia, �frica, Norte, América Central y del Sur, Australia y Oceanía";
    assert_eq!(fix_text(garbled, None), wanted);
}
1042
// Repair case: the garbled quoting around "scars´ stones" must come out as
// straight double quotes, and the stray "Â" characters must disappear.
#[test]
fn test_utf8_sloppy_windows_1250_mixup_in_english() {
    let garbled = "It was namedÂ â€žscarsÂ´ stonesâ€ś after the rock-climbers who got hurt while climbing on it.";
    let wanted = "It was named \"scars´ stones\" after the rock-climbers who got hurt while climbing on it.";
    assert_eq!(fix_text(garbled, None), wanted);
}
1050
// Repair case: same expected sentence as the sloppy Windows-1250 case, but
// starting from a differently garbled input literal.
#[test]
fn test_the_same_text_as_above_but_as_a_utf8_iso_8859_2_mixup() {
    let garbled = "It was namedÂ âscarsÂ´ stonesâ after the rock-climbers who got hurt while climbing on it.";
    let wanted = "It was named \"scars´ stones\" after the rock-climbers who got hurt while climbing on it.";
    assert_eq!(fix_text(garbled, None), wanted);
}
1058
// Repair case: a long sentence where French accents and an Arabic phrase are
// both garbled; curly apostrophes in the input become straight ones.
#[test]
fn test_utf8_windows1252_mixup_in_mixed_french_and_arabic() {
    let garbled = "Ã€ tous mes frÃ¨res et soeurs dans la syriennetÃ© comme dans l’humanitÃ©, sans discrimination aucune, je vous souhaite bonne fÃªte Ø¹ÙŠØ¯ Ø³Ø¹ÙŠØ¯.Que la paix, la libertÃ©, l’Ã©galitÃ©, la fraternitÃ© et la dignitÃ© soient avec vous.Pardonnez ce ton un peu ecclÃ©siastique.";
    let wanted = "À tous mes frères et soeurs dans la syrienneté comme dans l'humanité, sans discrimination aucune, je vous souhaite bonne fête عيد سعيد.Que la paix, la liberté, l'égalité, la fraternité et la dignité soient avec vous.Pardonnez ce ton un peu ecclésiastique.";
    assert_eq!(fix_text(garbled, None), wanted);
}
1066
// Repair case: the garbled second word must be restored to Romanian letters.
#[test]
fn test_utf8_sloppy_windows_1250_mixup_in_romanian() {
    let garbled = "vedere Ă®nceĹŁoĹźatÄ";
    let wanted = "vedere înceţoşată";
    assert_eq!(fix_text(garbled, None), wanted);
}
1074
// Repair case: the garbled Slovak words must regain their accented letters;
// the spaced "!" is kept.
#[test]
fn test_utf8_windows_1250_mixup_in_slovak() {
    let garbled = "NapĂĹˇte nĂˇm !";
    let wanted = "Napíšte nám !";
    assert_eq!(fix_text(garbled, None), wanted);
}
1082
// Repair case: the two-character garble in the second word must become "Ñ".
#[test]
fn test_utf8_windows_1252_mixup_in_spanish() {
    let garbled = "DOS AÃ‘OS";
    let wanted = "DOS AÑOS";
    assert_eq!(fix_text(garbled, None), wanted);
}
1090
// Repair case: the four-character garble before "5.8bn" must reduce to a
// pound sign.
#[test]
fn test_utf8_windows_1252_followed_by_utf8_windows_1251() {
    let garbled =
        "a bigger-than-expected Г‚ВЈ5.8bn rights issue to satisfy the new banking regulator";
    let wanted =
        "a bigger-than-expected £5.8bn rights issue to satisfy the new banking regulator";
    assert_eq!(fix_text(garbled, None), wanted);
}
1100
// Repair case: the garbled pairs after "$", "6" and "3" must become combining
// strike-through marks.
#[test]
fn test_fancy_unicode_crossing_out_but_mojibaked() {
    let garbled = "hotel $49 $Ì¶6Ì¶3Ì¶ updated 2018";
    let wanted = "hotel $49 $̶6̶3̶ updated 2018";
    assert_eq!(fix_text(garbled, None), wanted);
}
1108
// Repair case: the garbled literal must decode to the text face in the
// expected string.
#[test]
fn test_a_face_with_utf8_sloppy_windows_1252_mixed_up_twice() {
    let garbled = "Ã¢â€â€™(Ã¢Å’Â£Ã‹â€ºÃ¢Å’Â£)Ã¢â€Å½";
    let wanted = "┒(⌣˛⌣)┎";
    assert_eq!(fix_text(garbled, None), wanted);
}
1116
// Lossy repair case: the inner face is recovered, but the two outer
// characters come out as U+FFFD because the input already lost data.
#[test]
fn test_we_can_mostly_decode_the_face_above_when_we_lose_the_character_u009d() {
    let garbled = "Ã¢â€�â€™(Ã¢Å’Â£Ã‹â€ºÃ¢Å’Â£)Ã¢â€�Å½";
    let wanted = "�(⌣˛⌣)�";
    assert_eq!(fix_text(garbled, None), wanted);
}
1124
// Lossy repair case: opening quotes are recovered as straight quotes, while
// each closing sequence ending in "?" becomes a U+FFFD replacement character.
#[test]
fn test_lossy_decoding_can_have_plain_ascii_question_marks_as_well() {
    let garbled = "The ICR has been upgraded to â€œbb+â€? from â€œbbâ€?";
    let wanted = "The ICR has been upgraded to \"bb+� from \"bb�";
    assert_eq!(fix_text(garbled, None), wanted);
}
1132
// Repair case: the input spells its invisible characters with `\u{..}`
// escapes; the whole run must decode to a football followed by eight emoji.
#[test]
fn test_cesu8_latin_1_mixup_over_several_emoji() {
    let garbled =
        "I just figured out how to tweet emojis! â\u{009a}½í\u{00a0}½í¸\u{0080}í\u{00a0}½í¸\u{0081}í\u{00a0}½í¸\u{0082}í\u{00a0}½í¸\u{0086}í\u{00a0}½í¸\u{008e}í\u{00a0}½í¸\u{008e}í\u{00a0}½í¸\u{008e}í\u{00a0}½í¸\u{008e}";
    let wanted = "I just figured out how to tweet emojis! ⚽😀😁😂😆😎😎😎😎";
    assert_eq!(fix_text(garbled, None), wanted);
}
1141
// Unfixable case: the expected text is still garbled; compared with the
// input, the visible difference is curly quote characters becoming straight.
#[test]
fn test_an_absolutely_hopeless_garble() {
    let garbled = "ã†â€™ãƒâ€ ã¢â‚¬â„¢ãƒæ’ã‚â¢ãƒâ¢ã¢â‚¬å¡ã‚â¬ãƒâ€šã‚â";
    let wanted = "ã†â€™ãƒâ€ ã¢â'¬â\"¢ãƒæ'ã'â¢ãƒâ¢ã¢â'¬å¡ã'â¬ãƒâ€šã'â";
    assert_eq!(fix_text(garbled, None), wanted);
}
1149
// Repair case: each garbled quote must become a straight apostrophe, and the
// expected text ends with an ellipsis character.
#[test]
fn test_inconsistent_utf8_latin1_mojibake() {
    let garbled =
        "Ecuadorâs âpurely political decision on Assangeâ is likely result of âUS pressureâ";
    let wanted =
        "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…";
    assert_eq!(fix_text(garbled, None), wanted);
}
1159
// Repair case: same expected sentence as the plain inconsistent-mojibake
// case, but here the input already ends with a literal ellipsis character.
#[test]
fn test_inconsistent_utf8_latin1_mojibake_with_an_ellipsis_from_the_windows_1252_character_set()
{
    let garbled =
        "Ecuadorâs âpurely political decision on Assangeâ is likely result of âUS pressureâ…";
    let wanted =
        "Ecuador's 'purely political decision on Assange' is likely result of 'US pressure'…";
    assert_eq!(fix_text(garbled, None), wanted);
}
1170
// Repair case: garbled and correct accents appear in the same line; only the
// garbled ones change, and "Classificação" stays as written.
#[test]
fn test_inconsistent_mojibake_in_portuguese() {
    let garbled = "Campeonatos > III DivisÃ£o - SÃ©rie F > Jornadas Classificação";
    let wanted = "Campeonatos > III Divisão - Série F > Jornadas Classificação";
    assert_eq!(fix_text(garbled, None), wanted);
}
1178
// Normalisation case: the single character "ŉ" must be expanded to an
// apostrophe followed by "n" in both places.
#[test]
fn test_handle_afrikaans_n_character() {
    let input = "ŉ Chloroplas is ŉ organel wat in fotosinterende plante voorkom.";
    let wanted = "'n Chloroplas is 'n organel wat in fotosinterende plante voorkom.";
    assert_eq!(fix_text(input, None), wanted);
}
1186
// Normalisation case: single-codepoint digraphs become two plain letters
// ("nj", "lj"), and the low/high quotes become straight double quotes.
#[test]
fn test_handle_croatian_single_codepoint_digraphs() {
    let input = "izum „bootstrap load“ koji je korišteǌem polisilicijskog sloja proizveo dovoǉno dobre kondenzatore na čipu";
    let wanted = "izum \"bootstrap load\" koji je korištenjem polisilicijskog sloja proizveo dovoljno dobre kondenzatore na čipu";
    assert_eq!(fix_text(input, None), wanted);
}
1194
// Repair case: a single garbled pair inside a name must become "á".
#[test]
fn test_a_with_an_acute_accent_in_isolation() {
    let garbled = "NicolÃ¡s";
    let wanted = "Nicolás";
    assert_eq!(fix_text(garbled, None), wanted);
}
1202
// Repair case: the pair "√ü" at the end of the word must become "ß".
#[test]
fn test_sharp_s_in_isolation_via_macroman_encoding() {
    let garbled = "wei√ü";
    let wanted = "weiß";
    assert_eq!(fix_text(garbled, None), wanted);
}
1210
// Negative case: the expected literal reads the same as the input, so the
// Italian sentence must not be "repaired".
#[test]
fn test_negative_è_preceded_by_a_non_breaking_space_is_not_a_small_capital_y() {
    let input =
        "Con il corpo e lo spirito ammaccato, è come se nel cuore avessi un vetro conficcato.";
    let wanted =
        "Con il corpo e lo spirito ammaccato, è come se nel cuore avessi un vetro conficcato.";
    assert_eq!(fix_text(input, None), wanted);
}
1220
// Negative case: a multiplication sign followed by an ellipsis must be
// returned unchanged.
#[test]
fn test_negative_multiplication_sign_and_ellipsis() {
    let input = "4288×…";
    let wanted = "4288×…";
    assert_eq!(fix_text(input, None), wanted);
}
1228
// Negative case: backticks and acute accents used as quotes must be returned
// unchanged.
#[test]
fn test_negative_accents_are_sometimes_used_as_quotes() {
    let input = "``toda produzida pronta pra assa aí´´";
    let wanted = "``toda produzida pronta pra assa aí´´";
    assert_eq!(fix_text(input, None), wanted);
}
1236
// Negative case: "Õ" followed by an ellipsis must be returned unchanged.
#[test]
fn test_negative_õ_followed_by_an_ellipsis() {
    let input = "HUHLL Õ…";
    let wanted = "HUHLL Õ…";
    assert_eq!(fix_text(input, None), wanted);
}
1244
// Negative case: "Ê" followed by an ellipsis must be returned unchanged.
#[test]
fn test_negative_ê_followed_by_an_ellipsis() {
    let input = "RETWEET SE VOCÊ…";
    let wanted = "RETWEET SE VOCÊ…";
    assert_eq!(fix_text(input, None), wanted);
}
1252
// Negative case: "É" followed by an ellipsis must be returned unchanged.
#[test]
fn test_negative_é_followed_by_an_ellipsis() {
    let input = "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…";
    let wanted = "PARCE QUE SUR LEURS PLAQUES IL Y MARQUÉ…";
    assert_eq!(fix_text(input, None), wanted);
}
1260
// Negative case: "Ó" followed by an ellipsis must be returned unchanged.
#[test]
fn test_negative_ó_followed_by_an_ellipsis() {
    let input = "TEM QUE SEGUIR, SDV SÓ…";
    let wanted = "TEM QUE SEGUIR, SDV SÓ…";
    assert_eq!(fix_text(input, None), wanted);
}
1268
// Mostly negative case: "É" is kept; the only change is the curly apostrophe
// becoming a straight one.
#[test]
fn test_negative_é_followed_by_a_curly_apostrophe() {
    let input = "Join ZZAJÉ’s Official Fan List and receive news, events, and more!";
    let wanted = "Join ZZAJÉ's Official Fan List and receive news, events, and more!";
    assert_eq!(fix_text(input, None), wanted);
}
1276
// Mostly negative case: "é" is kept; the only change is the curly apostrophe
// becoming a straight one.
#[test]
fn test_negative_é_preceded_by_curly_apostrophe() {
    let input = "L’épisode 8 est trop fou ouahh";
    let wanted = "L'épisode 8 est trop fou ouahh";
    assert_eq!(fix_text(input, None), wanted);
}
1284
// Negative case: three circumflexed o's in a row must be returned unchanged.
#[test]
fn test_negative_three_raised_eyebrows_or_something() {
    let input = "Ôôô VIDA MINHA";
    let wanted = "Ôôô VIDA MINHA";
    assert_eq!(fix_text(input, None), wanted);
}
1292
// Negative case: the expected literal reads the same as the input, so the
// copyright sign must be left alone.
#[test]
fn test_negative_copyright_sign_preceded_by_non_breaking_space() {
    let input = "[x] ©";
    let wanted = "[x] ©";
    assert_eq!(fix_text(input, None), wanted);
}
1300
// Negative case: a dash followed by an infinity sign must be returned
// unchanged.
#[test]
fn test_negative_en_dash_and_infinity_sign() {
    let input = "2012—∞";
    let wanted = "2012—∞";
    assert_eq!(fix_text(input, None), wanted);
}
1308
// Negative case: per the test name, the final "Е" of the first word is a
// Cyrillic letter; the text must be returned unchanged.
#[test]
fn test_negative_this_e_is_a_ukrainian_letter_but_nothing_else_is_wrong() {
    let input = "SENSЕ - Oleg Tsedryk";
    let wanted = "SENSЕ - Oleg Tsedryk";
    assert_eq!(fix_text(input, None), wanted);
}
1316
// Negative case: the punctuation face, including its runs of spaces, must be
// returned unchanged.
#[test]
fn test_negative_angry_face() {
    let input = "OK??:(   `¬´    ):";
    let wanted = "OK??:(   `¬´    ):";
    assert_eq!(fix_text(input, None), wanted);
}
1324
// Negative case: the "¬ô" pair inside the face must be returned unchanged.
#[test]
fn test_negative_synthetic_face_with_glasses_and_a_raised_eyebrow() {
    let input = "( o¬ô )";
    let wanted = "( o¬ô )";
    assert_eq!(fix_text(input, None), wanted);
}
1332
// Negative case: a two-character string of symbols must be returned
// unchanged.
#[test]
fn test_negative_triangle_and_degree_sign() {
    let input = "∆°";
    let wanted = "∆°";
    assert_eq!(fix_text(input, None), wanted);
}
1340
// Negative case: "É" followed by an inverted question mark must be returned
// unchanged.
#[test]
fn test_negative_portuguese_with_inverted_question_mark() {
    let input = "ESSE CARA AI QUEM É¿";
    let wanted = "ESSE CARA AI QUEM É¿";
    assert_eq!(fix_text(input, None), wanted);
}
1348
// Negative case: backticks and doubled acute accents used as quotes must be
// returned unchanged.
#[test]
fn test_negative_portuguese_with_acute_accents_as_quotation_marks() {
    let input = "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´";
    let wanted = "``hogwarts nao existe, voce nao vai pegar o trem pra lá´´";
    assert_eq!(fix_text(input, None), wanted);
}
1356
// Negative case: the expected literal reads the same as the input, so the
// Finnish capitals must be left alone.
#[test]
fn test_negative_finnish_ä_followed_by_a_non_breaking_space() {
    let input = "SELKÄ EDELLÄ MAAHAN via @YouTube";
    let wanted = "SELKÄ EDELLÄ MAAHAN via @YouTube";
    assert_eq!(fix_text(input, None), wanted);
}
1364
// Negative case: a multiplication sign next to a pound sign must be returned
// unchanged.
#[test]
fn test_negative_multiplying_by_currency() {
    let input = "Offering 5×£35 pin ups";
    let wanted = "Offering 5×£35 pin ups";
    assert_eq!(fix_text(input, None), wanted);
}
1372
// Negative case: "É" followed by a registered-trademark sign must be returned
// unchanged.
#[test]
fn test_negative_registered_chocolate_brand_name() {
    let input = "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional";
    let wanted = "NESTLÉ® requiere contratar personal para diferentes areas a nivel nacional e internacional";
    assert_eq!(fix_text(input, None), wanted);
}
1380
// Mostly negative case: the accented French text is kept; the expected text
// has an ellipsis after "parlé" that is not visible in the input literal.
#[test]
fn test_mostly_negative_we_only_need_to_fix_c1_control_characters() {
    let input = "C'est vrai que nous n'en avons pas encore beaucoup parlé Tu sais, ça fait de nombreuses années";
    let wanted = "C'est vrai que nous n'en avons pas encore beaucoup parlé… Tu sais, ça fait de nombreuses années";
    assert_eq!(fix_text(input, None), wanted);
}
1388
// Repair case: the garbled "Ã" word must be restored to "à" with a single
// space after it.
#[test]
fn test_french_example_containing_non_breaking_spaces() {
    let garbled = "ART TRIP Ã  l'office de tourisme";
    let wanted = "ART TRIP à l'office de tourisme";
    assert_eq!(fix_text(garbled, None), wanted);
}
1396
// Repair case: the garble inside "signi..cantly" must end up as the plain
// letters "fi" in the expected text.
#[test]
fn test_english_example_in_utf8_windows_1251_with_a_ligature() {
    let garbled = "This is signiп¬Ѓcantly lower than the respective share";
    let wanted = "This is significantly lower than the respective share";
    assert_eq!(fix_text(garbled, None), wanted);
}
1404
// Repair case: "voilÃ" followed by extra spacing must become "voilà" with a
// single space after it.
#[test]
fn test_synthetic_we_can_recognize_ã_in_some_cases_when_its_the_only_mojibake() {
    let garbled = "voilÃ  le travail";
    let wanted = "voilà le travail";
    assert_eq!(fix_text(garbled, None), wanted);
}
1412
// Repair case: "voilÃ" followed by one space must become "voilà", with the
// space before "le" preserved.
#[test]
fn test_synthetic_we_can_recognize_ã_at_the_end_of_a_word_when_it_absorbs_a_following_space() {
    let garbled = "voilÃ le travail";
    let wanted = "voilà le travail";
    assert_eq!(fix_text(garbled, None), wanted);
}
1420
// Negative case: a spaced-out word containing "Ã" must be returned
// unchanged.
#[test]
fn test_negative_we_dont_fix_ã_in_all_contexts() {
    let input = "C O N C L U S Ã O";
    let wanted = "C O N C L U S Ã O";
    assert_eq!(fix_text(input, None), wanted);
}
1428
// Repair case: each standalone "Ã" must become "à" and stay a separate word,
// alongside the other repaired accents.
#[test]
fn test_à_remains_its_own_word_even_if_spaces_after_it_get_coalesced_into_one() {
    let garbled = "Ã perturber la rÃ©flexion des thÃ©ologiens jusqu'Ã nos jours";
    let wanted = "à perturber la réflexion des théologiens jusqu'à nos jours";
    assert_eq!(fix_text(garbled, None), wanted);
}
1436
// Repair case: garbled accents and a standalone "Ã" are fixed, and the curly
// apostrophes in the input come out straight.
#[test]
fn test_fix_à_in_inconsistent_mojibake() {
    let garbled = "Le barÃ¨me forfaitaire permet l’Ã©valuation des frais de dÃ©placement relatifs Ã l’utilisation";
    let wanted = "Le barème forfaitaire permet l'évaluation des frais de déplacement relatifs à l'utilisation";
    assert_eq!(fix_text(garbled, None), wanted);
}
1444
// Repair case: the garbled "Ã s" must come out as the single word "às", not
// as two separate words.
#[test]
fn test_the_portuguese_word_às_does_not_become_à_s_due_to_the_french_fix() {
    let garbled = "com especial atenÃ§Ã£o Ã s crianÃ§as";
    let wanted = "com especial atenção às crianças";
    assert_eq!(fix_text(garbled, None), wanted);
}
1452
// Repair case: here "Ã s'" must become "à s'" (two words), unlike the
// Portuguese "às" case.
#[test]
fn test_this_is_why_we_require_a_space_after_the_s_in_às() {
    let garbled = "TroisiÃ¨me Ã©dition pour ce festival qui persiste et signe Ã s'Ã©loigner des grands axes pour prendre les contre-allÃ©es en 16 concerts dans 7 villes de 2 pays voisins.";
    let wanted = "Troisième édition pour ce festival qui persiste et signe à s'éloigner des grands axes pour prendre les contre-allées en 16 concerts dans 7 villes de 2 pays voisins.";
    assert_eq!(fix_text(garbled, None), wanted);
}
1460
// Repair case: Cyrillic-looking garbles must be restored to French accents,
// with straight apostrophes in the result.
#[test]
fn test_we_can_fix_à_in_windows_1251_sometimes_as_well() {
    let garbled = "La rГ©gion de Dnepropetrovsk se trouve Г lвЂ™ouest de lвЂ™Ukraine";
    let wanted = "La région de Dnepropetrovsk se trouve à l'ouest de l'Ukraine";
    assert_eq!(fix_text(garbled, None), wanted);
}
1468
// Repair case: "Ã quele" must be joined into the single word "àquele" while
// the other garbled accents in the sentence are restored.
#[test]
fn test_ã_quele_is_the_portuguese_word_àquele_not_à_quele() {
    let garbled = "eliminado o antÃgeno e mantidos os nÃveis de anticorpos, surgem as condiÃ§Ãµes necessÃ¡rias ao estabelecimento do granuloma, semelhante Ã quele observado nas lesÃµes por imunocomplexo em excesso de anticorpos";
    let wanted = "eliminado o antígeno e mantidos os níveis de anticorpos, surgem as condições necessárias ao estabelecimento do granuloma, semelhante àquele observado nas lesões por imunocomplexo em excesso de anticorpos";
    assert_eq!(fix_text(garbled, None), wanted);
}
1476
// Lossy repair case: a long post where accents and some symbols are
// recovered, while sequences that already lost data come out as U+FFFD.
#[test]
fn test_a_complex_lossy_pile_up_of_mojibake_in_portuguese() {
    let garbled = "â € ðŸ“�Â Regulamento: â € âš ï¸� As pessoas que marcarem nos comentÃ¡rios perfis empresariais e/ou de marcas, personalidades ou fake serÃ£o desclassificadas. âš ï¸� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prÃªmio em nosso endereÃ§o. FuncionÃ¡rios estÃ£o vetados. âš ï¸� SerÃ£o vÃ¡lidos os comentÃ¡rios postados atÃ© 16h, do dia 31/03/2018. E o resultado serÃ¡ divulgado atÃ© Ã s 19h do mesmo dia em uma nova publicaÃ§Ã£o em nosso instagram. â € Boa sorte!!!Â ðŸ˜€ðŸ�°";
    let wanted = "⠀ � Regulamento: ⠀ ⚠� As pessoas que marcarem nos comentários perfis empresariais e/ou de marcas, personalidades ou fake serão desclassificadas. ⚠� Podem participar pessoas residentes em Petrolina/PE ou Juazeiro/BA, desde que se comprometam a retirar o prêmio em nosso endereço. Funcionários estão vetados. ⚠� Serão válidos os comentários postados até 16h, do dia 31/03/2018. E o resultado será divulgado até às 19h do mesmo dia em uma nova publicação em nosso instagram. ⠀ Boa sorte!!! 😀�";
    assert_eq!(fix_text(garbled, None), wanted);
}
1484
// Repair case: "Ã" plus the following spacing inside each word must collapse
// into "à", joining the word back together.
#[test]
fn test_utf8_windows_1252_mixup_in_gaelic_involving_non_breaking_spaces() {
    let garbled = "CÃ nan nan GÃ idheal";
    let wanted = "Cànan nan Gàidheal";
    assert_eq!(fix_text(garbled, None), wanted);
}
1492
// Repair case: the escaped characters U+0093 and U+0094 around the quoted
// phrase must become straight double quotes; "está" is left as is.
#[test]
fn test_misleading_mix_up_in_spanish() {
    let garbled = "tiene demora y está \u{0093}próximo a resolverse\u{0094}";
    let wanted = "tiene demora y está \"próximo a resolverse\"";
    assert_eq!(fix_text(garbled, None), wanted);
}
1500
// Repair case: the three-character garble before "~" must become a musical
// note symbol.
#[test]
fn test_punctuation_pile_up_should_actually_be_musical_notes() {
    let garbled = "Engkau masih yg terindah, indah di dalam hatikuâ™«~";
    let wanted = "Engkau masih yg terindah, indah di dalam hatiku♫~";
    assert_eq!(fix_text(garbled, None), wanted);
}
1508
// Repair case: the three-character garble between "2" and "Broadcast" must
// become an en dash.
#[test]
fn test_utf8_windows_1251_mixup_in_tweet_spam() {
    let garbled = "Blog Traffic Tip 2 вЂ“ Broadcast Email Your Blog";
    let wanted = "Blog Traffic Tip 2 – Broadcast Email Your Blog";
    assert_eq!(fix_text(garbled, None), wanted);
}
1516
// Repair case: the garbled apostrophe and both garbled quotes must come out
// as their straight ASCII forms.
#[test]
fn test_utf8_windows_1251_mixup() {
    let garbled = "S&P Confirms UkrsotsbankвЂ™s вЂњB-вЂњ Rating";
    let wanted = "S&P Confirms Ukrsotsbank's \"B-\" Rating";
    assert_eq!(fix_text(garbled, None), wanted);
}
1524
// Repair case: the pair "Ã«" in the middle of the word must become "ë".
#[test]
fn test_dutch_example_with_ë() {
    let garbled = "ongeÃ«venaard";
    let wanted = "ongeëvenaard";
    assert_eq!(fix_text(garbled, None), wanted);
}
1532
// Negative case: text dense with accented capitals, currency signs and long
// runs of spaces must be returned unchanged.
#[test]
fn test_negative_indonesian_leetspeak() {
    let input = "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????,                     ......JÄDÍ...";
    let wanted = "MÄ£ÄM ÌÑÌ Q £ÄGÌ GÄLÄW ÑÍCH SÖÄ£ ÑÝÄ $ÚÄMÌ Q £ÄGÌ GÄK ÉÑÄK BÄDÄÑ....?????????,                     ......JÄDÍ...";
    assert_eq!(fix_text(input, None), wanted);
}
1540
// Repair case: each long garbled run inside "t...l...charger" must reduce to
// a single "é".
#[test]
fn test_three_layers_of_utf8_macroman_mixup_in_french() {
    let garbled = "Merci de t‚Äö√†√∂¬¨¬©l‚Äö√†√∂¬¨¬©charger le plug-in Flash Player 8";
    let wanted = "Merci de télécharger le plug-in Flash Player 8";
    assert_eq!(fix_text(garbled, None), wanted);
}
1548
// Repair case: the trailing three-character garble must become an ellipsis
// character.
#[test]
fn test_utf8_macroman_mixup_in_french() {
    let garbled = "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter‚Ä¶";
    let wanted = "Merci de bien vouloir activiter le Javascript dans votre navigateur web afin d'en profiter…";
    assert_eq!(fix_text(garbled, None), wanted);
}
1556
// Repair case: the trailing pair "√≤" must become "ò".
#[test]
fn test_italian_utf8_macroman_example_with_ò() {
    let garbled = "Le Vigne di Zam√≤";
    let wanted = "Le Vigne di Zamò";
    assert_eq!(fix_text(garbled, None), wanted);
}
1564
// Repair case: a run of "×"-prefixed pairs must decode to a Hebrew word.
#[test]
fn test_hebrew_utf8_windows_1252_mojibake() {
    let garbled = "×‘×”×•×“×¢×”";
    let wanted = "בהודעה";
    assert_eq!(fix_text(garbled, None), wanted);
}
1572
// Repair case: same expected Hebrew word as the Windows-1252 case, from a
// slightly different garbled input.
#[test]
fn test_synthetic_hebrew_utf8_windows_1250_mojibake() {
    let garbled = "×‘×”×•×“×˘×”";
    let wanted = "בהודעה";
    assert_eq!(fix_text(garbled, None), wanted);
}
1580
// Repair case: a run of "◊"-prefixed pairs must decode to the same Hebrew
// word.
#[test]
fn test_synthetic_hebrew_utf8_macroman_mojibake() {
    let garbled = "◊ë◊î◊ï◊ì◊¢◊î";
    let wanted = "בהודעה";
    assert_eq!(fix_text(garbled, None), wanted);
}
1588
// Repair case: the garbled literal must decode to a four-letter Hebrew word.
#[test]
fn test_synthetic_hebrew_utf8_latin1_mojibake() {
    let garbled = "××××";
    let wanted = "אבבא";
    assert_eq!(fix_text(garbled, None), wanted);
}
1596
// Repair case: a run of "Ø"/"Ù"-prefixed pairs must decode to an Arabic
// word.
#[test]
fn test_synthetic_arabic_utf8_windows_1252_mojibake() {
    let garbled = "Ø±Ø³Ø§Ù„Ø©";
    let wanted = "رسالة";
    assert_eq!(fix_text(garbled, None), wanted);
}
1604
// Repair case: a run of "Ř"/"Ů"-prefixed pairs must decode to the same
// Arabic word.
#[test]
fn test_synthetic_arabic_utf8_windows_1250_mojibake() {
    let garbled = "Ř±ŘłŘ§Ů„Ř©";
    let wanted = "رسالة";
    assert_eq!(fix_text(garbled, None), wanted);
}
1612
// Repair case: a run of "ÿ"/"Ÿ"-prefixed pairs must decode to the same
// Arabic word.
#[test]
fn test_synthetic_arabic_utf8_macroman_mojibake() {
    let garbled = "ÿ±ÿ≥ÿßŸÑÿ©";
    let wanted = "رسالة";
    assert_eq!(fix_text(garbled, None), wanted);
}
1620
// Negative case: a square-root sign followed by pi must be returned
// unchanged.
#[test]
fn test_negative_math_in_unicode() {
    let input = "(-1/2)! = √π";
    let wanted = "(-1/2)! = √π";
    assert_eq!(fix_text(input, None), wanted);
}
1628
// Negative case: box-drawing characters interleaved with letters must be
// returned unchanged.
#[test]
fn test_negative_leet_line_art() {
    let input = "├┤a┼┐a┼┐a┼┐a┼┐a";
    let wanted = "├┤a┼┐a┼┐a┼┐a┼┐a";
    assert_eq!(fix_text(input, None), wanted);
}
1636
// Mostly negative case: "ë…" is kept; the only change is the closing curly
// quote becoming a straight double quote.
#[test]
fn test_synthetic_negative_brontës_name_does_not_end_with_a_korean_syllable() {
    let input = "I'm not such a fan of Charlotte Brontë…”";
    let wanted = "I'm not such a fan of Charlotte Brontë…\"";
    assert_eq!(fix_text(input, None), wanted);
}
1644
1645    #[test]
1646    fn test_synthetic_negative_hypothetical_swedish_product_name() {
1647        let original = "AHÅ™, the new sofa from IKEA";
1648        let expected = "AHÅ™, the new sofa from IKEA";
1649        let result = fix_text(original, None);
1650        assert_eq!(result, expected);
1651    }
1652
1653    #[test]
1654    fn test_synthetic_negative_ukrainian_capital_letters() {
1655        let original = "ВІКІ is Ukrainian for WIKI";
1656        let expected = "ВІКІ is Ukrainian for WIKI";
1657        let result = fix_text(original, None);
1658        assert_eq!(result, expected);
1659    }
1660
1661    #[test]
1662    fn test_synthetic_negative_dont_leak_our_internal_use_of_byte_0x1a() {
1663        let original = "These control characters \u{001a} are apparently intentional \u{0081}";
1664        let expected = "These control characters  are apparently intentional \u{0081}";
1665        let result = fix_text(original, None);
1666        assert_eq!(result, expected);
1667    }
1668
1669    #[test]
1670    fn test_synthetic_negative_u1a_on_its_own() {
1671        let original = "Here's a control character: ";
1672        let expected = "Here's a control character: ";
1673        let result = fix_text(original, None);
1674        assert_eq!(result, expected);
1675    }
1676
1677    #[test]
1678    fn test_synthetic_negative_a_with_circle_as_an_angstrom_sign() {
1679        let original = "a radius of 10 Å—";
1680        let expected = "a radius of 10 Å—";
1681        let result = fix_text(original, None);
1682        assert_eq!(result, expected);
1683    }
1684
1685    #[test]
1686    fn test_synthetic_negative_spanish_with_exclamation_points_on_the_wrong_sides() {
1687        let original = "!YO SÉ¡";
1688        let expected = "!YO SÉ¡";
1689        let result = fix_text(original, None);
1690        assert_eq!(result, expected);
1691    }
1692
1693    #[test]
1694    fn test_synthetic_fix_text_with_backslashes_in_it() {
1695        let original = "<40% vs â¥40%";
1696        let expected = "<40% vs ≥40%";
1697        let result = fix_text(original, None);
1698        assert_eq!(result, expected);
1699    }
1700
1701    #[test]
1702    fn test_synthetic_curly_quotes_with_mismatched_encoding_glitches_in_latin1() {
1703        let original = "âmismatched quotes";
1704
1705        let expected = "\"mismatched quotes…\"";
1706        let result = fix_text(original, None);
1707        assert_eq!(result, expected);
1708    }
1709
1710    #[test]
1711    fn test_synthetic_curly_quotes_with_mismatched_encoding_glitches_in_windows_1252() {
1712        let original = "â€œmismatched quotesâ€¦”";
1713        let expected = "\"mismatched quotes…\"";
1714        let result = fix_text(original, None);
1715        assert_eq!(result, expected);
1716    }
1717
1718    #[test]
1719    fn test_synthetic_lossy_decoding_in_sloppy_windows_1252() {
1720        let original = "â€œlossy decodingâ€�";
1721        let expected = "\"lossy decoding�";
1722        let result = fix_text(original, None);
1723        assert_eq!(result, expected);
1724    }
1725
1726    #[test]
1727    fn test_synthetic_french_word_for_august_in_windows_1252() {
1728        let original = "aoÃ»t";
1729        let expected = "août";
1730        let result = fix_text(original, None);
1731        assert_eq!(result, expected);
1732    }
1733
1734    #[test]
1735    fn test_synthetic_french_word_for_hotel_in_all_caps_windows_1252() {
1736        let original = "HÃ”TEL";
1737        let expected = "HÔTEL";
1738        let result = fix_text(original, None);
1739        assert_eq!(result, expected);
1740    }
1741
1742    #[test]
1743    fn test_synthetic_scottish_gaelic_word_for_subject_in_all_caps_windows_1252() {
1744        let original = "CÃ™IS";
1745        let expected = "CÙIS";
1746        let result = fix_text(original, None);
1747        assert_eq!(result, expected);
1748    }
1749
1750    #[test]
1751    fn test_synthetic_negative_romanian_word_before_a_non_breaking_space() {
1752        let original = "NICIODATĂ ";
1753        let expected = "NICIODATĂ ";
1754        let result = fix_text(original, None);
1755        assert_eq!(result, expected);
1756    }
1757
1758    #[test]
1759    fn test_synthetic_negative_be_careful_around_curly_apostrophes() {
1760        let original = "There are a lot of Ã’s in mojibake text";
1761        let expected = "There are a lot of Ã's in mojibake text";
1762        let result = fix_text(original, None);
1763        assert_eq!(result, expected);
1764    }
1765
1766    #[test]
1767    fn test_synthetic_negative_romanian_word_before_a_trademark_sign() {
1768        let original = "NICIODATĂ™";
1769        let expected = "NICIODATĂ™";
1770        let result = fix_text(original, None);
1771        assert_eq!(result, expected);
1772    }
1773
1774    #[test]
1775    fn test_synthetic_negative_camel_cased_serbian_that_looks_like_a_utf8_windows_1251_mixup() {
1776        let original = "ПоздравЂаво";
1777        let expected = "ПоздравЂаво";
1778        let result = fix_text(original, None);
1779        assert_eq!(result, expected);
1780    }
1781
1782    #[test]
1783    fn test_synthetic_mojibake_with_trademark_sign_at_the_end_of_a_word() {
1784        let original = "OÃ™ ET QUAND?";
1785        let expected = "OÙ ET QUAND?";
1786        let result = fix_text(original, None);
1787        assert_eq!(result, expected);
1788    }
1789}
plsfix/lib.rs

plsfix/
lib.rs