harper_core/parsers/
isolate_english.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
use crate::{language_detection::is_likely_english, Dictionary};

use super::{Parser, Token, TokenStringExt};

/// A [`Parser`] decorator that filters the output of another parser.
///
/// The wrapped parser does the actual tokenizing; this type then discards the chunks of
/// tokens that a language-detection heuristic judges not to be English.
pub struct IsolateEnglish<D>
where
    D: Dictionary,
{
    /// The wrapped parser whose tokens get filtered.
    inner: Box<dyn Parser>,
    /// The dictionary handed to the English-detection heuristic.
    dict: D,
}

impl<D: Dictionary> IsolateEnglish<D> {
    pub fn new(inner: Box<dyn Parser>, dictionary: D) -> Self {
        Self {
            inner,
            dict: dictionary,
        }
    }
}

impl<D: Dictionary> Parser for IsolateEnglish<D> {
    /// Runs the inner parser over `source` and returns only the tokens that belong to
    /// chunks which are kept.
    ///
    /// A chunk is kept when it has fewer than five tokens, or when `is_likely_english`
    /// accepts it. The tokens of every other chunk are left out of the result.
    fn parse(&mut self, source: &[char]) -> Vec<Token> {
        // Chunks with fewer tokens than this are kept without consulting the heuristic.
        const MIN_TOKENS_FOR_DETECTION: usize = 5;

        let parsed = self.inner.parse(source);

        // Upper bound on the result size: at most every parsed token survives.
        let mut kept: Vec<Token> = Vec::with_capacity(parsed.len());

        for chunk in parsed.iter_chunks() {
            let needs_check = chunk.len() >= MIN_TOKENS_FOR_DETECTION;

            // Only chunks that are long enough are subject to language detection.
            if needs_check && !is_likely_english(chunk, source, &self.dict) {
                continue;
            }

            kept.extend(chunk);
        }

        kept
    }
}

#[cfg(test)]
mod tests {
    use crate::{parsers::PlainEnglish, Document, FstDictionary, TokenStringExt};

    use super::IsolateEnglish;

    /// Parse `text` through [`IsolateEnglish`] (wrapping [`PlainEnglish`]) and check that
    /// neither words nor punctuation remain in the resulting document.
    fn assert_no_english(text: &str) {
        let dict = FstDictionary::curated();
        let mut parser = IsolateEnglish::new(Box::new(PlainEnglish), dict.clone());

        let document = Document::new(text, &mut parser, &dict);

        let word_count = document.iter_words().count();
        assert_eq!(word_count, 0);

        let punctuation_count = document.iter_punctuations().count();
        assert_eq!(punctuation_count, 0);
    }

    /// Parse `source` through [`IsolateEnglish`] (wrapping [`PlainEnglish`]) and check that
    /// the resulting document renders as exactly `target`.
    fn assert_stripped_english(source: &str, target: &str) {
        let dict = FstDictionary::curated();
        let mut parser = IsolateEnglish::new(Box::new(PlainEnglish), dict.clone());

        let document = Document::new(source, &mut parser, &dict);
        let rendered = document.to_string();

        assert_eq!(rendered, target);
    }

    #[test]
    fn mixed_spanish_english_breakfast() {
        let text =
            "En la mañana, como a dish de los huevos, un poquito of tocino, y a lot of leche.";

        assert_no_english(text);
    }

    #[test]
    fn mixed_spanish_english_politics() {
        let text = "No estoy of acuerdo con the politics de Los estados unidos ahora; pienso que we need mas diversidad in el gobierno.";

        assert_no_english(text);
    }

    #[test]
    fn english_no_edit_motto() {
        // Pure English input is expected to come back unchanged.
        let text = "I have a simple motto in life: ";

        assert_stripped_english(text, text);
    }

    #[test]
    fn chunked_trad_chinese_english() {
        let source = "I have a simple motto in life: 如果你渴了,就喝水。";
        let expected = "I have a simple motto in life:";

        assert_stripped_english(source, expected);
    }

    #[test]
    fn chunked_trad_polish_english() {
        let source = "I have a simple motto in life: jeśli jesteś spragniony, napij się wody.";
        let expected = "I have a simple motto in life:";

        assert_stripped_english(source, expected);
    }
}