harper_core/patterns/
word_set.rs

1use super::SingleTokenPattern;
2use smallvec::SmallVec;
3
4use crate::{CharString, Token};
5
6/// A [`super::Pattern`] that matches against any of a set of provided words.
7/// For small sets of short words, it doesn't allocate.
8///
9/// Note that any capitalization of the contained words will result in a match.
10#[derive(Debug, Default, Clone)]
11pub struct WordSet {
12    words: SmallVec<[CharString; 4]>,
13}
14
15impl WordSet {
16    pub fn add(&mut self, word: &str) {
17        let chars = word.chars().collect();
18
19        if !self.words.contains(&chars) {
20            self.words.push(chars);
21        }
22    }
23
24    pub fn add_chars(&mut self, chars: &[char]) {
25        if !self.words.iter().any(|i| i.as_ref() == chars) {
26            self.words.push(chars.into());
27        }
28    }
29
30    pub fn contains(&self, word: &str) -> bool {
31        self.words.contains(&word.chars().collect())
32    }
33
34    /// Create a new word set that matches against any word in the provided list.
35    pub fn new(words: &[&'static str]) -> Self {
36        let mut set = Self::default();
37
38        for str in words {
39            set.add(str);
40        }
41
42        set
43    }
44}
45
46impl SingleTokenPattern for WordSet {
47    fn matches_token(&self, token: &Token, source: &[char]) -> bool {
48        if !token.kind.is_word() {
49            return false;
50        }
51
52        let tok_chars = token.span.get_content(source);
53
54        for word in &self.words {
55            if tok_chars.len() != word.len() {
56                continue;
57            }
58
59            fn canonical(c: &char) -> char {
60                match c {
61                    '\u{2018}' | '\u{2019}' | '\u{02BC}' | '\u{FF07}' => '\'',
62                    '\u{201C}' | '\u{201D}' | '\u{FF02}' => '"',
63                    '\u{2013}' | '\u{2014}' | '\u{2212}' | '\u{FF0D}' => '-',
64                    _ => *c,
65                }
66            }
67
68            let partial_match = tok_chars
69                .iter()
70                .map(canonical)
71                .zip(word.iter().map(canonical))
72                .all(|(a, b)| a.eq_ignore_ascii_case(&b));
73
74            if partial_match {
75                return true;
76            }
77        }
78
79        false
80    }
81}
82
#[cfg(test)]
mod tests {
    use crate::{Document, Span, patterns::DocPattern};

    use super::WordSet;

    /// Build a `WordSet` from `words`, run it over `text` parsed as a
    /// document, and assert that the matched spans equal `expected`.
    fn assert_word_spans(words: &[&'static str], text: &str, expected: Vec<Span>) {
        let doc = Document::new_markdown_default_curated(text);
        let found = WordSet::new(words).find_all_matches_in_doc(&doc);

        assert_eq!(found, expected);
    }

    /// Lowercase words in the set are found in lowercase text.
    #[test]
    fn fruit() {
        assert_word_spans(
            &["banana", "apple", "orange"],
            "I ate a banana and an apple today.",
            vec![Span::new(6, 7), Span::new(12, 13)],
        );
    }

    /// Mixed capitalization in the text still yields the same matches.
    #[test]
    fn fruit_whack_capitalization() {
        assert_word_spans(
            &["banana", "apple", "orange"],
            "I Ate A bAnaNa And aN apPlE today.",
            vec![Span::new(6, 7), Span::new(12, 13)],
        );
    }

    /// A typographic apostrophe in the text matches an ASCII one in the set.
    #[test]
    fn supports_typographic_apostrophes() {
        assert_word_spans(&["They're"], "They’re", vec![Span::new(0, 1)]);
    }
}