harper_core/patterns/
word_set.rs

1use super::SingleTokenPattern;
2use smallvec::SmallVec;
3
4use crate::{CharString, Token, char_ext::CharExt};
5
6/// A [`super::Pattern`] that matches against any of a set of provided words.
7/// For small sets of short words, it doesn't allocate.
8///
9/// Note that any capitalization of the contained words will result in a match.
10#[derive(Debug, Default, Clone)]
11pub struct WordSet {
12    words: SmallVec<[CharString; 4]>,
13}
14
15impl WordSet {
16    pub fn add(&mut self, word: &str) {
17        let chars = word.chars().collect();
18
19        if !self.words.contains(&chars) {
20            self.words.push(chars);
21        }
22    }
23
24    pub fn add_chars(&mut self, chars: &[char]) {
25        if !self.words.iter().any(|i| i.as_ref() == chars) {
26            self.words.push(chars.into());
27        }
28    }
29
30    pub fn contains(&self, word: &str) -> bool {
31        self.words.contains(&word.chars().collect())
32    }
33
34    /// Create a new word set that matches against any word in the provided list.
35    pub fn new(words: &[&'static str]) -> Self {
36        let mut set = Self::default();
37
38        for str in words {
39            set.add(str);
40        }
41
42        set
43    }
44}
45
46impl SingleTokenPattern for WordSet {
47    fn matches_token(&self, token: &Token, source: &[char]) -> bool {
48        if !token.kind.is_word() {
49            return false;
50        }
51
52        let tok_chars = token.span.get_content(source);
53
54        for word in &self.words {
55            if tok_chars.len() != word.len() {
56                continue;
57            }
58
59            let partial_match = tok_chars
60                .iter()
61                .map(CharExt::normalized)
62                .zip(word.iter().map(CharExt::normalized))
63                .all(|(a, b)| a.eq_ignore_ascii_case(&b));
64
65            if partial_match {
66                return true;
67            }
68        }
69
70        false
71    }
72}
73
#[cfg(test)]
mod tests {
    use crate::{Document, Span, patterns::DocPattern};

    use super::WordSet;

    /// Stored words are found in running text.
    ///
    /// The expected spans line up with token positions when whitespace is counted as a
    /// token: `banana` is token 6 and `apple` is token 12.
    #[test]
    fn fruit() {
        let text = "I ate a banana and an apple today.";
        let parsed = Document::new_markdown_default_curated(text);
        let fruits = WordSet::new(&["banana", "apple", "orange"]);

        assert_eq!(
            fruits.find_all_matches_in_doc(&parsed),
            vec![Span::new(6, 7), Span::new(12, 13)]
        );
    }

    /// Arbitrary mixed capitalization in the text still matches the lowercase words.
    #[test]
    fn fruit_whack_capitalization() {
        let text = "I Ate A bAnaNa And aN apPlE today.";
        let parsed = Document::new_markdown_default_curated(text);
        let fruits = WordSet::new(&["banana", "apple", "orange"]);

        assert_eq!(
            fruits.find_all_matches_in_doc(&parsed),
            vec![Span::new(6, 7), Span::new(12, 13)]
        );
    }

    /// A word stored with a plain apostrophe matches text that uses a typographic one.
    #[test]
    fn supports_typographic_apostrophes() {
        let text = "They’re";
        let parsed = Document::new_markdown_default_curated(text);
        let contraction = WordSet::new(&["They're"]);

        assert_eq!(
            contraction.find_all_matches_in_doc(&parsed),
            vec![Span::new(0, 1)]
        );
    }
}