harper_core/
language_detection.rs

1//! This module implements rudimentary, dictionary-based English language detection.
2
3use crate::spell::Dictionary;
4use crate::{Document, Token, TokenKind};
5
6/// Check if the contents of the document are likely intended to represent
7/// English.
8pub fn is_doc_likely_english(doc: &Document, dict: &impl Dictionary) -> bool {
9    is_likely_english(doc.get_tokens(), doc.get_source(), dict)
10}
11
12/// Check if given tokens are likely intended to represent English.
13pub fn is_likely_english(toks: &[Token], source: &[char], dict: &impl Dictionary) -> bool {
14    let mut total_words = 0;
15    let mut valid_words = 0;
16    let mut punctuation = 0;
17    let mut unlintable = 0;
18
19    for token in toks {
20        match token.kind {
21            TokenKind::Word(_) => {
22                total_words += 1;
23
24                let word_content = token.span.get_content(source);
25                if dict.contains_word(word_content) {
26                    valid_words += 1;
27                }
28            }
29            TokenKind::Punctuation(_) => punctuation += 1,
30            TokenKind::Unlintable => unlintable += 1,
31            _ => (),
32        }
33    }
34
35    if total_words <= 7 && total_words - valid_words > 0 {
36        return false;
37    }
38
39    if unlintable > valid_words {
40        return false;
41    }
42
43    if (punctuation as f32 * 1.25) > valid_words as f32 {
44        return false;
45    }
46
47    if (valid_words as f64 / total_words as f64) < 0.7 {
48        return false;
49    }
50
51    true
52}
53
#[cfg(test)]
mod tests {
    use super::is_doc_likely_english;
    use crate::Document;
    use crate::spell::FstDictionary;

    /// Parse `source` as plain English against the curated dictionary and
    /// report what the detector decides.
    fn detect(source: &'static str) -> bool {
        let dict = FstDictionary::curated();
        let doc = Document::new_plain_english(source, &dict);
        is_doc_likely_english(&doc, &dict)
    }

    /// Fail the test if `source` is classified as English.
    fn assert_not_english(source: &'static str) {
        assert!(
            !detect(source),
            "expected text to be rejected as non-English: {:?}",
            source
        );
    }

    /// Fail the test if `source` is not classified as English.
    fn assert_english(source: &'static str) {
        assert!(
            detect(source),
            "expected text to be accepted as English: {:?}",
            source
        );
    }

    #[test]
    fn detects_spanish() {
        assert_not_english("Esto es español. Harper no debería marcarlo como inglés.");
    }

    #[test]
    fn detects_french() {
        assert_not_english(
            "C'est du français. Il ne devrait pas être marqué comme anglais par Harper.",
        );
    }

    #[test]
    fn detects_shebang() {
        assert_not_english("#! /bin/bash");
        assert_not_english("#! /usr/bin/fish");
    }

    #[test]
    fn detects_short_english() {
        assert_english("This is English!");
    }

    /// A couple of misspellings must not flip the verdict on longer text.
    #[test]
    fn detects_english() {
        assert_english("This is perfectly valid English, evn if it has a cople typos.");
    }

    #[test]
    fn detects_expressive_english() {
        assert_english("Look above! That is real English! So is this: bippity bop!");
    }

    /// Useful for detecting commented-out code.
    #[test]
    fn detects_python_fib() {
        assert_not_english(
            r"
def fibIter(n):
    if n < 2:
        return n
    fibPrev = 1
    fib = 1
    for _ in range(2, n):
        fibPrev, fib = fib, fib + fibPrev
    return fib
        ",
        );
    }

    #[test]
    fn mixed_french_english_park() {
        assert_not_english("Je voudrais promener au the park a huit heures with ma voisine");
    }

    #[test]
    fn mixed_french_english_drunk() {
        assert_not_english("Je ne suis pas drunk, je suis only ivre by you");
    }

    #[test]
    fn mixed_french_english_dress() {
        assert_not_english(
            "Je buy une robe nouveau chaque Tuesday, mais aujourd'hui, je don't have temps",
        );
    }

    #[test]
    fn english_motto() {
        assert_english("I have a simple motto in life");
    }
}