harper_core/
language_detection.rs1use crate::{Dictionary, Document, Token, TokenKind};
2
3pub fn is_doc_likely_english(doc: &Document, dict: &impl Dictionary) -> bool {
6 is_likely_english(doc.get_tokens(), doc.get_source(), dict)
7}
8
9pub fn is_likely_english(toks: &[Token], source: &[char], dict: &impl Dictionary) -> bool {
11 let mut total_words = 0;
12 let mut valid_words = 0;
13 let mut punctuation = 0;
14 let mut unlintable = 0;
15
16 for token in toks {
17 match token.kind {
18 TokenKind::Word(_) => {
19 total_words += 1;
20
21 let word_content = token.span.get_content(source);
22 if dict.contains_word(word_content) {
23 valid_words += 1;
24 }
25 }
26 TokenKind::Punctuation(_) => punctuation += 1,
27 TokenKind::Unlintable => unlintable += 1,
28 _ => (),
29 }
30 }
31
32 if total_words <= 7 && total_words - valid_words > 0 {
33 return false;
34 }
35
36 if unlintable > valid_words {
37 return false;
38 }
39
40 if (punctuation as f32 * 1.25) > valid_words as f32 {
41 return false;
42 }
43
44 if (valid_words as f64 / total_words as f64) < 0.7 {
45 return false;
46 }
47
48 true
49}
50
#[cfg(test)]
mod tests {
    use super::is_doc_likely_english;
    use crate::{Document, FstDictionary};

    /// Builds a document from `source` with `Document::new_plain_english`,
    /// using the curated `FstDictionary`, and returns the detector's verdict.
    fn looks_english(source: &str) -> bool {
        let dict = FstDictionary::curated();
        let doc = Document::new_plain_english(source, &dict);
        is_doc_likely_english(&doc, &dict)
    }

    /// Fails the test, naming the offending input, if `source` is classified
    /// as English.
    fn assert_not_english(source: &'static str) {
        assert!(
            !looks_english(source),
            "expected text to be rejected as non-English: {:?}",
            source
        );
    }

    /// Fails the test, naming the offending input, if `source` is not
    /// classified as English.
    fn assert_english(source: &'static str) {
        assert!(
            looks_english(source),
            "expected text to be accepted as English: {:?}",
            source
        );
    }

    #[test]
    fn detects_spanish() {
        assert_not_english("Esto es español. Harper no debería marcarlo como inglés.");
    }

    #[test]
    fn detects_french() {
        assert_not_english(
            "C'est du français. Il ne devrait pas être marqué comme anglais par Harper.",
        );
    }

    #[test]
    fn detects_shebang() {
        assert_not_english("#! /bin/bash");
        assert_not_english("#! /usr/bin/fish");
    }

    #[test]
    fn detects_short_english() {
        assert_english("This is English!");
    }

    // The typos are intentional: a few misspellings must not flip the verdict.
    #[test]
    fn detects_english() {
        assert_english("This is perfectly valid English, evn if it has a cople typos.");
    }

    #[test]
    fn detects_expressive_english() {
        assert_english("Look above! That is real English! So is this: bippity bop!");
    }

    // Source code must not be mistaken for English prose.
    #[test]
    fn detects_python_fib() {
        assert_not_english(
            r"
def fibIter(n):
    if n < 2:
        return n
    fibPrev = 1
    fib = 1
    for _ in range(2, n):
        fibPrev, fib = fib, fib + fibPrev
    return fib
        ",
        );
    }

    #[test]
    fn mixed_french_english_park() {
        assert_not_english("Je voudrais promener au the park a huit heures with ma voisine");
    }

    #[test]
    fn mixed_french_english_drunk() {
        assert_not_english("Je ne suis pas drunk, je suis only ivre by you");
    }

    #[test]
    fn mixed_french_english_dress() {
        assert_not_english(
            "Je buy une robe nouveau chaque Tuesday, mais aujourd'hui, je don't have temps",
        );
    }

    #[test]
    fn english_motto() {
        assert_english("I have a simple motto in life");
    }
}