harper_core/
language_detection.rs

1use crate::{Dictionary, Document, Token, TokenKind};
2
3/// Check if the contents of the document are likely intended to represent
4/// English.
5pub fn is_doc_likely_english(doc: &Document, dict: &impl Dictionary) -> bool {
6    is_likely_english(doc.get_tokens(), doc.get_source(), dict)
7}
8
9/// Check if given tokens are likely intended to represent English.
10pub fn is_likely_english(toks: &[Token], source: &[char], dict: &impl Dictionary) -> bool {
11    let mut total_words = 0;
12    let mut valid_words = 0;
13    let mut punctuation = 0;
14    let mut unlintable = 0;
15
16    for token in toks {
17        match token.kind {
18            TokenKind::Word(_) => {
19                total_words += 1;
20
21                let word_content = token.span.get_content(source);
22                if dict.contains_word(word_content) {
23                    valid_words += 1;
24                }
25            }
26            TokenKind::Punctuation(_) => punctuation += 1,
27            TokenKind::Unlintable => unlintable += 1,
28            _ => (),
29        }
30    }
31
32    if total_words <= 7 && total_words - valid_words > 0 {
33        return false;
34    }
35
36    if unlintable > valid_words {
37        return false;
38    }
39
40    if (punctuation as f32 * 1.25) > valid_words as f32 {
41        return false;
42    }
43
44    if (valid_words as f64 / total_words as f64) < 0.7 {
45        return false;
46    }
47
48    true
49}
50
#[cfg(test)]
mod tests {
    use super::is_doc_likely_english;
    use crate::{Document, FstDictionary};

    /// Parse `source` as plain English with the curated dictionary and run the
    /// document-level language detection on it.
    fn detect(source: &'static str) -> bool {
        let dict = FstDictionary::curated();
        let doc = Document::new_plain_english(source, &dict);
        is_doc_likely_english(&doc, &dict)
    }

    /// Fail the test if `source` is classified as English.
    fn assert_not_english(source: &'static str) {
        // The message carries the input so a failure identifies the text
        // without printing anything on passing runs.
        assert!(
            !detect(source),
            "expected text to be rejected as non-English: {source:?}"
        );
    }

    /// Fail the test if `source` is not classified as English.
    fn assert_english(source: &'static str) {
        assert!(
            detect(source),
            "expected text to be accepted as English: {source:?}"
        );
    }

    #[test]
    fn detects_spanish() {
        assert_not_english("Esto es español. Harper no debería marcarlo como inglés.");
    }

    #[test]
    fn detects_french() {
        assert_not_english(
            "C'est du français. Il ne devrait pas être marqué comme anglais par Harper.",
        );
    }

    #[test]
    fn detects_shebang() {
        assert_not_english("#! /bin/bash");
        assert_not_english("#! /usr/bin/fish");
    }

    #[test]
    fn detects_short_english() {
        assert_english("This is English!");
    }

    /// A couple of typos must not be enough to reject otherwise valid English.
    #[test]
    fn detects_english() {
        assert_english("This is perfectly valid English, evn if it has a cople typos.");
    }

    #[test]
    fn detects_expressive_english() {
        assert_english("Look above! That is real English! So is this: bippity bop!");
    }

    /// Useful for detecting commented-out code.
    #[test]
    fn detects_python_fib() {
        assert_not_english(
            r"
def fibIter(n):
    if n < 2:
        return n
    fibPrev = 1
    fib = 1
    for _ in range(2, n):
        fibPrev, fib = fib, fib + fibPrev
    return fib
        ",
        );
    }

    #[test]
    fn mixed_french_english_park() {
        assert_not_english("Je voudrais promener au the park a huit heures with ma voisine");
    }

    #[test]
    fn mixed_french_english_drunk() {
        assert_not_english("Je ne suis pas drunk, je suis only ivre by you");
    }

    #[test]
    fn mixed_french_english_dress() {
        assert_not_english(
            "Je buy une robe nouveau chaque Tuesday, mais aujourd'hui, je don't have temps",
        );
    }

    #[test]
    fn english_motto() {
        assert_english("I have a simple motto in life");
    }
}
143}