harper_core/
language_detection.rs1use crate::spell::Dictionary;
4use crate::{Document, Token, TokenKind};
5
6pub fn is_doc_likely_english(doc: &Document, dict: &impl Dictionary) -> bool {
9 is_likely_english(doc.get_tokens(), doc.get_source(), dict)
10}
11
12pub fn is_likely_english(toks: &[Token], source: &[char], dict: &impl Dictionary) -> bool {
14 let mut total_words = 0;
15 let mut valid_words = 0;
16 let mut punctuation = 0;
17 let mut unlintable = 0;
18
19 for token in toks {
20 match token.kind {
21 TokenKind::Word(_) => {
22 total_words += 1;
23
24 let word_content = token.span.get_content(source);
25 if dict.contains_word(word_content) {
26 valid_words += 1;
27 }
28 }
29 TokenKind::Punctuation(_) => punctuation += 1,
30 TokenKind::Unlintable => unlintable += 1,
31 _ => (),
32 }
33 }
34
35 if total_words <= 7 && total_words - valid_words > 0 {
36 return false;
37 }
38
39 if unlintable > valid_words {
40 return false;
41 }
42
43 if (punctuation as f32 * 1.25) > valid_words as f32 {
44 return false;
45 }
46
47 if (valid_words as f64 / total_words as f64) < 0.7 {
48 return false;
49 }
50
51 true
52}
53
#[cfg(test)]
mod tests {
    use super::is_doc_likely_english;
    use crate::Document;
    use crate::spell::FstDictionary;

    /// Build a plain-English [`Document`] from `source` with the curated
    /// dictionary and return the detector's verdict for it.
    fn looks_english(source: &str) -> bool {
        let dict = FstDictionary::curated();
        let doc = Document::new_plain_english(source, &dict);
        is_doc_likely_english(&doc, &dict)
    }

    /// Fail the test, naming the offending text, if `source` is classified
    /// as English.
    fn assert_not_english(source: &str) {
        assert!(
            !looks_english(source),
            "expected text to be rejected as non-English: {:?}",
            source
        );
    }

    /// Fail the test, naming the offending text, if `source` is not
    /// classified as English.
    fn assert_english(source: &str) {
        assert!(
            looks_english(source),
            "expected text to be accepted as English: {:?}",
            source
        );
    }

    #[test]
    fn detects_spanish() {
        assert_not_english("Esto es español. Harper no debería marcarlo como inglés.");
    }

    #[test]
    fn detects_french() {
        assert_not_english(
            "C'est du français. Il ne devrait pas être marqué comme anglais par Harper.",
        );
    }

    #[test]
    fn detects_shebang() {
        assert_not_english("#! /bin/bash");
        assert_not_english("#! /usr/bin/fish");
    }

    #[test]
    fn detects_short_english() {
        assert_english("This is English!");
    }

    #[test]
    fn detects_english() {
        // The typos are deliberate: a couple of unknown words must not flip the verdict.
        assert_english("This is perfectly valid English, evn if it has a cople typos.");
    }

    #[test]
    fn detects_expressive_english() {
        assert_english("Look above! That is real English! So is this: bippity bop!");
    }

    #[test]
    fn detects_python_fib() {
        assert_not_english(
            r"
def fibIter(n):
    if n < 2:
        return n
    fibPrev = 1
    fib = 1
    for _ in range(2, n):
        fibPrev, fib = fib, fib + fibPrev
    return fib
        ",
        );
    }

    #[test]
    fn mixed_french_english_park() {
        assert_not_english("Je voudrais promener au the park a huit heures with ma voisine");
    }

    #[test]
    fn mixed_french_english_drunk() {
        assert_not_english("Je ne suis pas drunk, je suis only ivre by you");
    }

    #[test]
    fn mixed_french_english_dress() {
        assert_not_english(
            "Je buy une robe nouveau chaque Tuesday, mais aujourd'hui, je don't have temps",
        );
    }

    #[test]
    fn english_motto() {
        assert_english("I have a simple motto in life");
    }
}