ares/checkers/
english.rs

1use crate::checkers::checker_result::CheckResult;
2use crate::storage;
3use lemmeknow::Identifier;
4use log::{debug, trace};
5
6use crate::checkers::checker_type::{Check, Checker};
7
/// Checks English plaintext.
///
/// Zero-sized marker type: it carries no data of its own and is used as the type
/// parameter of `Checker` (`Checker<EnglishChecker>`), whose `Check` implementation
/// in this file performs the actual dictionary lookup.
pub struct EnglishChecker;
10
11/// given an input, check every item in the array and return true if any of them match
12impl Check for Checker<EnglishChecker> {
13    fn new() -> Self {
14        Checker {
15            name: "English Checker",
16            description: "Checks for english words",
17            link: "https://en.wikipedia.org/wiki/List_of_English_words",
18            tags: vec!["english"],
19            expected_runtime: 0.1,
20            /// English is the most popular language
21            popularity: 1.0,
22            lemmeknow_config: Identifier::default(),
23            _phantom: std::marker::PhantomData,
24        }
25    }
26
27    fn check(&self, input: &str) -> CheckResult {
28        let original_input = input;
29        // Normalise the string
30        let input = normalise_string(input);
31        trace!("Checking English for sentence {}", input);
32        /// If 40% of the words are in the english list, then we consider it english.
33        /// This is the threshold at which we consider it english.
34        /// TODO: Do we want to put this into a config somewhere?
35        const PLAINTEXT_DETECTION_PERCENTAGE: f64 = 0.4;
36        let mut words_found: f64 = 0.0;
37
38        // TODO: Change this when the below bugs are fixed.
39        let filename = "English text";
40
41        let mut result = CheckResult {
42            is_identified: false,
43            text: original_input.to_string(),
44            checker_name: self.name,
45            checker_description: self.description,
46            description: filename.to_string(),
47            link: self.link,
48        };
49
50        // After we've normalised our string, if we find it's a length 0 we don't do anything
51        // This can happen if our string is a single puncuation mark, for example.
52        if input.is_empty() {
53            return result;
54        }
55
56        let split_input = input.split(' ');
57
58        // loop through all the words in the input
59        for word in split_input {
60            // if the word is in the english list, then we consider it english
61            // TODO: I think the below function iterates through each dictionary in turn.
62            // Which means it'll try English.txt, then rockyou.txt etc
63            // This is inefficient and makes it harder to compute what dictionary the word came from.
64            // We should probably just use a single dictionary and assign the filenames to the values in the dictionary.
65            // Like {"hello": "English.txt"} etc.
66            // If we're using muiltiple dictionaries we may also have duplicated words which is inefficient.
67            if storage::DICTIONARIES
68                .iter()
69                .any(|(_, words)| words.contains(word))
70            {
71                trace!("Found word {} in English", word);
72                words_found += 1.0;
73            }
74
75            trace!(
76                "Checking word {} with words_found {} and input length: {}",
77                word,
78                words_found,
79                input.len()
80            );
81            // TODO: We are also typecasting to f64 instead of usize, which costs CPU cycles.
82            if words_found / (input.split(' ').count()) as f64 > PLAINTEXT_DETECTION_PERCENTAGE {
83                debug!("Found {} words in {}", words_found, original_input);
84                debug!(
85                    "Returning from English chekcer successfully with {}",
86                    original_input
87                );
88                result.is_identified = true;
89                break;
90            }
91        }
92
93        result
94    }
95}
96
/// Normalises text so the English checker can compare it against dictionary words.
///
/// Raw strings may contain commas, capital letters and so on. This function:
/// - removes every ASCII punctuation character, and
/// - lowercases ASCII letters.
///
/// Whitespace and non-ASCII characters are passed through unchanged.
fn normalise_string(input: &str) -> String {
    // TODO add more punctuation
    let mut cleaned = String::with_capacity(input.len());
    for ch in input.chars() {
        if ch.is_ascii_punctuation() {
            continue;
        }
        cleaned.push(ch.to_ascii_lowercase());
    }
    cleaned
}
111
#[cfg(test)]
mod tests {
    use crate::checkers::english::normalise_string;
    use crate::checkers::{
        checker_type::{Check, Checker},
        english::EnglishChecker,
    };

    /// Runs a freshly constructed English checker over `text` and reports whether
    /// the text was identified as English.
    fn identified_as_english(text: &str) -> bool {
        Checker::<EnglishChecker>::new().check(text).is_identified
    }

    #[test]
    fn test_check_basic() {
        assert!(identified_as_english("preinterview"));
    }

    #[test]
    fn test_check_basic2() {
        assert!(identified_as_english("and"));
    }

    #[test]
    fn test_check_multiple_words() {
        assert!(identified_as_english("zzz zu'lkadah zenelophon"));
    }

    #[test]
    fn test_check_non_dictionary_word() {
        let identified =
            identified_as_english("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaBabyShark");
        assert!(!identified);
    }

    #[test]
    fn test_check_multiple_words2() {
        assert!(identified_as_english("preinterview hello dog"));
    }

    #[test]
    fn test_check_normalise_string_works_with_lowercasing() {
        assert_eq!(normalise_string("Hello Dear"), "hello dear");
    }

    #[test]
    fn test_check_normalise_string_works_with_puncuation() {
        assert_eq!(normalise_string("Hello, Dear"), "hello dear");
    }

    #[test]
    fn test_check_normalise_string_works_with_messy_puncuation() {
        assert_eq!(normalise_string(".He/ll?O, Dea!r"), "hello dear");
    }

    #[test]
    fn test_checker_works_with_puncuation_and_lowercase() {
        assert!(identified_as_english("Prei?nterview He!llo Dog?"));
    }

    #[test]
    fn test_checker_fails_doesnt_hit_40_percent() {
        // Only two of the six words are real words, which stays below the threshold.
        let identified =
            identified_as_english("Hello Dog nnnnnnnnnnn llllllll ppppppppp gggggggg");
        assert!(!identified);
    }

    #[test]
    fn test_check_fail_single_puncuation_char() {
        // A lone punctuation mark normalises to the empty string.
        assert!(!identified_as_english("#"));
    }
}