harper_core/patterns/
word_set.rs1use super::SingleTokenPattern;
2use smallvec::SmallVec;
3
4use crate::{CharString, Token};
5
6#[derive(Debug, Default, Clone)]
11pub struct WordSet {
12 words: SmallVec<[CharString; 4]>,
13}
14
15impl WordSet {
16 pub fn add(&mut self, word: &str) {
17 let chars = word.chars().collect();
18
19 if !self.words.contains(&chars) {
20 self.words.push(chars);
21 }
22 }
23
24 pub fn contains(&self, word: &str) -> bool {
25 self.words.contains(&word.chars().collect())
26 }
27
28 pub fn new(words: &[&'static str]) -> Self {
30 let mut set = Self::default();
31
32 for str in words {
33 set.add(str);
34 }
35
36 set
37 }
38}
39
40impl SingleTokenPattern for WordSet {
41 fn matches_token(&self, token: &Token, source: &[char]) -> bool {
42 if !token.kind.is_word() {
43 return false;
44 }
45
46 let tok_chars = token.span.get_content(source);
47
48 for word in &self.words {
49 if tok_chars.len() != word.len() {
50 continue;
51 }
52
53 fn canonical(c: &char) -> char {
54 match c {
55 '\u{2018}' | '\u{2019}' | '\u{02BC}' | '\u{FF07}' => '\'',
56 '\u{201C}' | '\u{201D}' | '\u{FF02}' => '"',
57 '\u{2013}' | '\u{2014}' | '\u{2212}' | '\u{FF0D}' => '-',
58 _ => *c,
59 }
60 }
61
62 let partial_match = tok_chars
63 .iter()
64 .map(canonical)
65 .zip(word.iter().map(canonical))
66 .all(|(a, b)| a.eq_ignore_ascii_case(&b));
67
68 if partial_match {
69 return true;
70 }
71 }
72
73 false
74 }
75}
76
#[cfg(test)]
mod tests {
    use super::WordSet;
    use crate::{Document, Span, patterns::DocPattern};

    /// Builds the word set shared by the fruit tests.
    fn fruit_set() -> WordSet {
        WordSet::new(&["banana", "apple", "orange"])
    }

    /// Lowercase words in running text are found at their token positions.
    #[test]
    fn fruit() {
        let document = Document::new_markdown_default_curated("I ate a banana and an apple today.");

        let found = fruit_set().find_all_matches_in_doc(&document);

        assert_eq!(found, vec![Span::new(6, 7), Span::new(12, 13)]);
    }

    /// Matching is unaffected by unusual capitalization in the document.
    #[test]
    fn fruit_whack_capitalization() {
        let document = Document::new_markdown_default_curated("I Ate A bAnaNa And aN apPlE today.");

        let found = fruit_set().find_all_matches_in_doc(&document);

        assert_eq!(found, vec![Span::new(6, 7), Span::new(12, 13)]);
    }

    /// A typographic apostrophe in the document matches a plain one in the set.
    #[test]
    fn supports_typographic_apostrophes() {
        let contraction = WordSet::new(&["They're"]);
        let document = Document::new_markdown_default_curated("They’re");

        let found = contraction.find_all_matches_in_doc(&document);

        assert_eq!(found, vec![Span::new(0, 1)]);
    }
}