harper_core/patterns/
sequence_pattern.rs1use hashbrown::HashSet;
2use paste::paste;
3
4use super::whitespace_pattern::WhitespacePattern;
5use super::{
6 AnyCapitalization, AnyPattern, IndefiniteArticle, NounPhrase, Pattern, RepeatingPattern,
7 SingularSubject, WordSet,
8};
9use crate::Lrc;
10use crate::{CharStringExt, Token, TokenKind};
11
12#[derive(Default)]
34pub struct SequencePattern {
35 token_patterns: Vec<Box<dyn Pattern>>,
36}
37
/// Generates three builder methods for a `TokenKind` predicate named
/// `is_<quality>`:
///
/// * `then_<quality>`: one token for which the predicate holds,
/// * `then_one_or_more_<quality>s`: the same test routed through
///   [`SequencePattern::then_one_or_more`],
/// * `then_anything_but_<quality>`: one token for which the predicate does
///   *not* hold.
macro_rules! gen_then_from_is {
    ($quality:ident) => {
        paste! {
            /// Appends a step that matches a token whose kind satisfies the
            /// corresponding `is_*` predicate.
            pub fn [< then_$quality >] (mut self) -> Self {
                self.token_patterns.push(Box::new(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                }));

                self
            }

            /// Appends a repeating step (see `then_one_or_more`) built from
            /// the corresponding `is_*` predicate.
            pub fn [< then_one_or_more_$quality s >] (self) -> Self {
                self.then_one_or_more(Box::new(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                }))
            }

            /// Appends a step that matches a token whose kind does NOT
            /// satisfy the corresponding `is_*` predicate.
            pub fn [< then_anything_but_$quality >] (mut self) -> Self {
                self.token_patterns.push(Box::new(|tok: &Token, _source: &[char]| {
                    !tok.kind.[< is_$quality >]()
                }));

                self
            }
        }
    };
}
70
71impl SequencePattern {
72 gen_then_from_is!(noun);
73 gen_then_from_is!(possessive_noun);
74 gen_then_from_is!(plural_noun);
75 gen_then_from_is!(verb);
76 gen_then_from_is!(linking_verb);
77 gen_then_from_is!(pronoun);
78 gen_then_from_is!(punctuation);
79 gen_then_from_is!(conjunction);
80 gen_then_from_is!(comma);
81 gen_then_from_is!(period);
82 gen_then_from_is!(number);
83 gen_then_from_is!(case_separator);
84 gen_then_from_is!(adverb);
85 gen_then_from_is!(adjective);
86 gen_then_from_is!(apostrophe);
87 gen_then_from_is!(hyphen);
88 gen_then_from_is!(article);
89 gen_then_from_is!(proper_noun);
90
91 pub fn then_word_set(self, set: WordSet) -> Self {
92 self.then(Box::new(set))
93 }
94
95 pub fn then_indefinite_article(self) -> Self {
96 self.then(Box::new(IndefiniteArticle::default()))
97 }
98
99 pub fn then_noun_phrase(self) -> Self {
101 self.then(Box::new(NounPhrase))
102 }
103
104 pub fn then_exact_word(mut self, word: &'static str) -> Self {
105 self.token_patterns
106 .push(Box::new(|tok: &Token, source: &[char]| {
107 if !tok.kind.is_word() {
108 return false;
109 }
110
111 let tok_chars = tok.span.get_content(source);
112
113 let mut w_char_count = 0;
114 for (i, w_char) in word.chars().enumerate() {
115 w_char_count += 1;
116
117 if tok_chars.get(i).cloned() != Some(w_char) {
118 return false;
119 }
120 }
121
122 w_char_count == tok_chars.len()
123 }));
124 self
125 }
126
127 pub fn then_singular_subject(self) -> Self {
128 self.then(Box::new(SingularSubject::default()))
129 }
130
131 pub fn aco(word: &'static str) -> Self {
133 Self::any_capitalization_of(word)
134 }
135
136 pub fn any_capitalization_of(word: &'static str) -> Self {
137 Self::default().then_any_capitalization_of(word)
138 }
139
140 pub fn t_aco(self, word: &'static str) -> Self {
142 self.then_any_capitalization_of(word)
143 }
144
145 pub fn then_any_capitalization_of(mut self, word: &'static str) -> Self {
147 self.token_patterns
148 .push(Box::new(AnyCapitalization::from_string(word)));
149 self
150 }
151
152 pub fn t_eworl(self, word: &'static str) -> Self {
154 self.then_exact_word_or_lowercase(word)
155 }
156
157 pub fn then_exact_word_or_lowercase(mut self, word: &'static str) -> Self {
158 self.token_patterns
159 .push(Box::new(|tok: &Token, source: &[char]| {
160 if !tok.kind.is_word() {
161 return false;
162 }
163
164 let tok_chars = tok.span.get_content(source).to_lower();
165
166 let mut w_char_count = 0;
167 for (i, w_char) in word.to_lowercase().chars().enumerate() {
168 w_char_count += 1;
169
170 if tok_chars.get(i).cloned() != Some(w_char) {
171 return false;
172 }
173 }
174
175 w_char_count == tok_chars.len()
176 }));
177 self
178 }
179
180 pub fn then_loose(mut self, kind: TokenKind) -> Self {
181 self.token_patterns
182 .push(Box::new(move |tok: &Token, _source: &[char]| {
183 kind.with_default_data() == tok.kind.with_default_data()
184 }));
185
186 self
187 }
188
189 pub fn then_any_word(mut self) -> Self {
190 self.token_patterns
191 .push(Box::new(|tok: &Token, _source: &[char]| tok.kind.is_word()));
192 self
193 }
194
195 pub fn then_strict(mut self, kind: TokenKind) -> Self {
196 self.token_patterns
197 .push(Box::new(move |tok: &Token, _source: &[char]| {
198 tok.kind == kind
199 }));
200 self
201 }
202
203 pub fn then_whitespace(mut self) -> Self {
204 self.token_patterns.push(Box::new(WhitespacePattern));
205 self
206 }
207
208 pub fn then_any_word_in(mut self, word_set: Lrc<HashSet<&'static str>>) -> Self {
209 self.token_patterns
210 .push(Box::new(move |tok: &Token, source: &[char]| {
211 let tok_chars = tok.span.get_content(source);
212 let word: String = tok_chars.iter().collect();
213 word_set.contains(word.as_str())
214 }));
215 self
216 }
217
218 pub fn then_one_or_more(mut self, pat: Box<dyn Pattern>) -> Self {
219 self.token_patterns
220 .push(Box::new(RepeatingPattern::new(pat, 0)));
221 self
222 }
223
224 pub fn then_anything(mut self) -> Self {
225 self.token_patterns.push(Box::new(AnyPattern));
226 self
227 }
228
229 pub fn then(mut self, pat: Box<dyn Pattern>) -> Self {
230 self.token_patterns.push(pat);
231 self
232 }
233}
234
235impl Pattern for SequencePattern {
236 fn matches(&self, tokens: &[Token], source: &[char]) -> usize {
237 let mut tok_cursor = 0;
238
239 for pat in self.token_patterns.iter() {
240 let match_length = pat.matches(&tokens[tok_cursor..], source);
241
242 if match_length == 0 {
243 return 0;
244 }
245
246 tok_cursor += match_length;
247 }
248
249 tok_cursor
250 }
251}
252
#[cfg(test)]
mod tests {
    use hashbrown::HashSet;

    use super::SequencePattern;
    use crate::patterns::Pattern;
    use crate::{Document, Lrc};

    /// Parses `text` and asserts that `pat`, applied at the first token,
    /// consumes every token of the resulting document.
    fn assert_matches_whole_document(pat: &SequencePattern, text: &str) {
        let doc = Document::new_plain_english_curated(text);
        let tokens = doc.get_tokens();

        assert_eq!(pat.matches(tokens, doc.get_source()), tokens.len());
    }

    #[test]
    fn matches_n_whitespace_tokens() {
        let pat = SequencePattern::default()
            .then_any_word()
            .then_whitespace()
            .then_any_word();

        assert_matches_whole_document(&pat, "word\n    \nword");
    }

    #[test]
    fn matches_specific_words() {
        let pat = SequencePattern::default()
            .then_exact_word("she")
            .then_whitespace()
            .then_exact_word("her");

        assert_matches_whole_document(&pat, "she her");
    }

    #[test]
    fn matches_sets() {
        let pronouns: HashSet<&'static str> = ["his", "hers"].iter().copied().collect();
        let pronouns = Lrc::new(pronouns);

        let pat = SequencePattern::default()
            .then_exact_word("it")
            .then_whitespace()
            .then_exact_word("was")
            .then_whitespace()
            .then_any_word_in(pronouns);

        assert_matches_whole_document(&pat, "it was hers");
    }
}