harper_core/patterns/
word_set.rs

1use super::SingleTokenPattern;
2use smallvec::SmallVec;
3
4use crate::{CharString, Token};
5
6/// A [`super::Pattern`] that matches against any of a set of provided words.
7/// For small sets of short words, it doesn't allocate.
8///
9/// Note that any capitalization of the contained words will result in a match.
10#[derive(Debug, Default, Clone)]
11pub struct WordSet {
12    words: SmallVec<[CharString; 4]>,
13}
14
15impl WordSet {
16    pub fn add(&mut self, word: &str) {
17        let chars = word.chars().collect();
18
19        if !self.words.contains(&chars) {
20            self.words.push(chars);
21        }
22    }
23
24    pub fn contains(&self, word: &str) -> bool {
25        self.words.contains(&word.chars().collect())
26    }
27
28    /// Create a new word set that matches against any word in the provided list.
29    pub fn new(words: &[&'static str]) -> Self {
30        let mut set = Self::default();
31
32        for str in words {
33            set.add(str);
34        }
35
36        set
37    }
38}
39
40impl SingleTokenPattern for WordSet {
41    fn matches_token(&self, token: &Token, source: &[char]) -> bool {
42        if !token.kind.is_word() {
43            return false;
44        }
45
46        let tok_chars = token.span.get_content(source);
47
48        for word in &self.words {
49            if tok_chars.len() != word.len() {
50                continue;
51            }
52
53            fn canonical(c: &char) -> char {
54                match c {
55                    '\u{2018}' | '\u{2019}' | '\u{02BC}' | '\u{FF07}' => '\'',
56                    '\u{201C}' | '\u{201D}' | '\u{FF02}' => '"',
57                    '\u{2013}' | '\u{2014}' | '\u{2212}' | '\u{FF0D}' => '-',
58                    _ => *c,
59                }
60            }
61
62            let partial_match = tok_chars
63                .iter()
64                .map(canonical)
65                .zip(word.iter().map(canonical))
66                .all(|(a, b)| a.eq_ignore_ascii_case(&b));
67
68            if partial_match {
69                return true;
70            }
71        }
72
73        false
74    }
75}
76
#[cfg(test)]
mod tests {
    use crate::{Document, Span, patterns::DocPattern};

    use super::WordSet;

    /// Word list shared by the fruit tests.
    const FRUITS: [&str; 3] = ["banana", "apple", "orange"];

    /// Lower-case fruit names are found at their token positions.
    #[test]
    fn fruit() {
        let expected = vec![Span::new(6, 7), Span::new(12, 13)];

        let document = Document::new_markdown_default_curated("I ate a banana and an apple today.");
        let fruits = WordSet::new(&FRUITS);

        assert_eq!(fruits.find_all_matches_in_doc(&document), expected);
    }

    /// Irregular capitalization in the text does not prevent a match.
    #[test]
    fn fruit_whack_capitalization() {
        let expected = vec![Span::new(6, 7), Span::new(12, 13)];

        let document = Document::new_markdown_default_curated("I Ate A bAnaNa And aN apPlE today.");
        let fruits = WordSet::new(&FRUITS);

        assert_eq!(fruits.find_all_matches_in_doc(&document), expected);
    }

    /// A word stored with an ASCII apostrophe matches text that uses a curly one.
    #[test]
    fn supports_typographic_apostrophes() {
        let expected = vec![Span::new(0, 1)];

        let document = Document::new_markdown_default_curated("They’re");
        let contraction = WordSet::new(&["They're"]);

        assert_eq!(contraction.find_all_matches_in_doc(&document), expected);
    }
}