harper_core/parsers/
isolate_english.rs1use super::{Parser, Token, TokenStringExt};
2use crate::language_detection::is_likely_english;
3use crate::spell::Dictionary;
4
5pub struct IsolateEnglish<D: Dictionary> {
8 inner: Box<dyn Parser>,
9 dict: D,
10}
11
12impl<D: Dictionary> IsolateEnglish<D> {
13 pub fn new(inner: Box<dyn Parser>, dictionary: D) -> Self {
14 Self {
15 inner,
16 dict: dictionary,
17 }
18 }
19}
20
21impl<D: Dictionary> Parser for IsolateEnglish<D> {
22 fn parse(&self, source: &[char]) -> Vec<Token> {
23 let tokens = self.inner.parse(source);
24
25 let mut english_tokens: Vec<Token> = Vec::with_capacity(tokens.len());
26
27 for chunk in tokens.iter_chunks() {
28 if chunk.len() < 4 || is_likely_english(chunk, source, &self.dict) {
29 english_tokens.extend_from_slice(chunk);
30 }
31 }
32
33 english_tokens
34 }
35}
36
#[cfg(test)]
mod tests {
    use super::IsolateEnglish;
    use crate::spell::FstDictionary;
    use crate::{Document, TokenStringExt, parsers::PlainEnglish};

    /// Asserts that, after passing `text` through `IsolateEnglish`, the
    /// resulting document contains neither words nor punctuation.
    fn assert_no_english(text: &str) {
        let dictionary = FstDictionary::curated();
        let parser = IsolateEnglish::new(Box::new(PlainEnglish), dictionary.clone());

        let doc = Document::new(text, &parser, &dictionary);

        let word_count = doc.iter_words().count();
        assert_eq!(word_count, 0);

        let punctuation_count = doc.iter_punctuations().count();
        assert_eq!(punctuation_count, 0);
    }

    /// Asserts that, after passing `source` through `IsolateEnglish`, the
    /// resulting document renders as exactly `target`.
    fn assert_stripped_english(source: &str, target: &str) {
        let dictionary = FstDictionary::curated();
        let parser = IsolateEnglish::new(Box::new(PlainEnglish), dictionary.clone());

        let doc = Document::new(source, &parser, &dictionary);

        let rendered = doc.to_string();
        assert_eq!(rendered, target);
    }

    #[test]
    fn mixed_spanish_english_breakfast() {
        let text =
            "En la mañana, como a dish de los huevos, un poquito of tocino, y a lot of leche.";
        assert_no_english(text);
    }

    #[test]
    fn mixed_spanish_english_politics() {
        let text = "No estoy of acuerdo con the politics de Los estados unidos ahora; pienso que we need mas diversidad in el gobierno.";
        assert_no_english(text);
    }

    #[test]
    fn english_no_edit_motto() {
        // Pure English input is expected to come back unchanged.
        let source = "I have a simple motto in life: ";
        let target = "I have a simple motto in life: ";
        assert_stripped_english(source, target);
    }

    #[test]
    fn chunked_trad_chinese_english() {
        let source = "I have a simple motto in life: 如果你渴了,就喝水。";
        let target = "I have a simple motto in life:";
        assert_stripped_english(source, target);
    }

    #[test]
    fn chunked_trad_polish_english() {
        let source = "I have a simple motto in life: jeśli jesteś spragniony, napij się wody.";
        let target = "I have a simple motto in life:";
        assert_stripped_english(source, target);
    }
}