harper_core/patterns/
word_set.rs1use super::SingleTokenPattern;
2use smallvec::SmallVec;
3
4use crate::{CharString, Token};
5
6#[derive(Debug, Default, Clone)]
11pub struct WordSet {
12 words: SmallVec<[CharString; 4]>,
13}
14
15impl WordSet {
16 pub fn add(&mut self, word: &str) {
17 let chars = word.chars().collect();
18
19 if !self.words.contains(&chars) {
20 self.words.push(chars);
21 }
22 }
23
24 pub fn add_chars(&mut self, chars: &[char]) {
25 if !self.words.iter().any(|i| i.as_ref() == chars) {
26 self.words.push(chars.into());
27 }
28 }
29
30 pub fn contains(&self, word: &str) -> bool {
31 self.words.contains(&word.chars().collect())
32 }
33
34 pub fn new(words: &[&'static str]) -> Self {
36 let mut set = Self::default();
37
38 for str in words {
39 set.add(str);
40 }
41
42 set
43 }
44}
45
46impl SingleTokenPattern for WordSet {
47 fn matches_token(&self, token: &Token, source: &[char]) -> bool {
48 if !token.kind.is_word() {
49 return false;
50 }
51
52 let tok_chars = token.span.get_content(source);
53
54 for word in &self.words {
55 if tok_chars.len() != word.len() {
56 continue;
57 }
58
59 fn canonical(c: &char) -> char {
60 match c {
61 '\u{2018}' | '\u{2019}' | '\u{02BC}' | '\u{FF07}' => '\'',
62 '\u{201C}' | '\u{201D}' | '\u{FF02}' => '"',
63 '\u{2013}' | '\u{2014}' | '\u{2212}' | '\u{FF0D}' => '-',
64 _ => *c,
65 }
66 }
67
68 let partial_match = tok_chars
69 .iter()
70 .map(canonical)
71 .zip(word.iter().map(canonical))
72 .all(|(a, b)| a.eq_ignore_ascii_case(&b));
73
74 if partial_match {
75 return true;
76 }
77 }
78
79 false
80 }
81}
82
#[cfg(test)]
mod tests {
    use super::WordSet;
    use crate::{Document, Span, patterns::DocPattern};

    /// Word list shared by the fruit tests.
    const FRUIT: [&str; 3] = ["banana", "apple", "orange"];

    /// Builds a `WordSet` from `words`, searches `text` with it, and asserts
    /// that the reported spans equal `expected`.
    #[track_caller]
    fn assert_matches(words: &[&'static str], text: &str, expected: Vec<Span>) {
        let set = WordSet::new(words);
        let doc = Document::new_markdown_default_curated(text);

        let found = set.find_all_matches_in_doc(&doc);

        assert_eq!(found, expected);
    }

    // NOTE(review): the expected spans appear to index tokens (with
    // whitespace counted as a token) rather than characters — confirm.

    #[test]
    fn fruit() {
        assert_matches(
            &FRUIT,
            "I ate a banana and an apple today.",
            vec![Span::new(6, 7), Span::new(12, 13)],
        );
    }

    #[test]
    fn fruit_whack_capitalization() {
        assert_matches(
            &FRUIT,
            "I Ate A bAnaNa And aN apPlE today.",
            vec![Span::new(6, 7), Span::new(12, 13)],
        );
    }

    #[test]
    fn supports_typographic_apostrophes() {
        assert_matches(&["They're"], "They’re", vec![Span::new(0, 1)]);
    }
}