harper_core/parsers/
isolate_english.rs

1use super::{Parser, Token, TokenStringExt};
2use crate::language_detection::is_likely_english;
3use crate::spell::Dictionary;
4
5/// A parser that wraps another, using heuristics to quickly redact paragraphs of a document that aren't
6/// intended to be English text.
7pub struct IsolateEnglish<D: Dictionary> {
8    inner: Box<dyn Parser>,
9    dict: D,
10}
11
12impl<D: Dictionary> IsolateEnglish<D> {
13    pub fn new(inner: Box<dyn Parser>, dictionary: D) -> Self {
14        Self {
15            inner,
16            dict: dictionary,
17        }
18    }
19}
20
21impl<D: Dictionary> Parser for IsolateEnglish<D> {
22    fn parse(&self, source: &[char]) -> Vec<Token> {
23        let tokens = self.inner.parse(source);
24
25        let mut english_tokens: Vec<Token> = Vec::with_capacity(tokens.len());
26
27        for chunk in tokens.iter_chunks() {
28            if chunk.len() < 4 || is_likely_english(chunk, source, &self.dict) {
29                english_tokens.extend_from_slice(chunk);
30            }
31        }
32
33        english_tokens
34    }
35}
36
#[cfg(test)]
mod tests {
    use super::IsolateEnglish;
    use crate::spell::FstDictionary;
    use crate::{Document, TokenStringExt, parsers::PlainEnglish};

    /// Asserts that, after isolation, `text` yields a document with no words and no punctuation,
    /// i.e. that it contains _no_ chunks accepted as English.
    fn assert_no_english(text: &str) {
        let dict = FstDictionary::curated();
        let parser = IsolateEnglish::new(Box::new(PlainEnglish), dict.clone());
        let document = Document::new(text, &parser, &dict);

        let word_count = document.iter_words().count();
        assert_eq!(word_count, 0);

        let punctuation_count = document.iter_punctuations().count();
        assert_eq!(punctuation_count, 0);
    }

    /// Asserts that the document built from `source`, once its non-English chunks have been
    /// stripped, renders as exactly `target`.
    fn assert_stripped_english(source: &str, target: &str) {
        let dict = FstDictionary::curated();
        let parser = IsolateEnglish::new(Box::new(PlainEnglish), dict.clone());
        let document = Document::new(source, &parser, &dict);

        let rendered = document.to_string();
        assert_eq!(rendered, target);
    }

    /// A Spanish sentence sprinkled with English words should be dropped entirely.
    #[test]
    fn mixed_spanish_english_breakfast() {
        let text =
            "En la mañana, como a dish de los huevos, un poquito of tocino, y a lot of leche.";
        assert_no_english(text);
    }

    /// A second mixed Spanish/English sentence that should be dropped entirely.
    #[test]
    fn mixed_spanish_english_politics() {
        let text = "No estoy of acuerdo con the politics de Los estados unidos ahora; pienso que we need mas diversidad in el gobierno.";
        assert_no_english(text);
    }

    /// Purely English input must pass through unchanged.
    #[test]
    fn english_no_edit_motto() {
        let text = "I have a simple motto in life: ";
        assert_stripped_english(text, text);
    }

    /// The Chinese tail is removed while the English lead-in is kept.
    #[test]
    fn chunked_trad_chinese_english() {
        let source = "I have a simple motto in life: 如果你渴了,就喝水。";
        let target = "I have a simple motto in life:";
        assert_stripped_english(source, target);
    }

    /// The Polish tail is removed while the English lead-in is kept.
    #[test]
    fn chunked_trad_polish_english() {
        let source = "I have a simple motto in life: jeśli jesteś spragniony, napij się wody.";
        let target = "I have a simple motto in life:";
        assert_stripped_english(source, target);
    }
}