// harper_core/patterns/sequence_pattern.rs

1use hashbrown::HashSet;
2use paste::paste;
3
4use super::whitespace_pattern::WhitespacePattern;
5use super::{
6    AnyCapitalization, AnyPattern, IndefiniteArticle, NounPhrase, Pattern, RepeatingPattern,
7    SingularSubject, WordSet,
8};
9use crate::Lrc;
10use crate::{CharStringExt, Token, TokenKind};
11
12/// A pattern that checks that a sequence of other patterns match.
13/// There are specific extension methods available, but you can also use [`Self::then`] to add
14/// arbitrary patterns.
15///
16/// ## Example
17///
18/// Let's say we wanted to locate places in a [`Document`](crate::Document) where an article is followed by a noun.
19/// We can do that with a `SequencePattern`.
20///
21/// ```rust
22/// use harper_core::patterns::{SequencePattern, DocPattern};
23/// use harper_core::{Document, Span};
24///
25/// let document = Document::new_markdown_default_curated("This is a test.");
26///
27/// let pattern = SequencePattern::default().then_article().then_whitespace().then_noun();
28/// let matches = pattern.find_all_matches_in_doc(&document);
29///
30/// // The pattern found that the tokens at indexes 4, 5, and 6 fit the criteria.
31/// assert_eq!(matches, vec![Span::new(4, 7)]);
32/// ```
33#[derive(Default)]
34pub struct SequencePattern {
35    token_patterns: Vec<Box<dyn Pattern>>,
36}
37
/// Generate a family of `then_*` builder methods from an `is_*` predicate on [`TokenKind`].
///
/// For an identifier `$quality`, three methods are produced:
///
/// - `then_$quality`: the next token must satisfy `TokenKind::is_$quality`.
/// - `then_one_or_more_$quality` with an `s` appended: the same predicate, handed to
///   `then_one_or_more`.
/// - `then_anything_but_$quality`: the next token must *not* satisfy the predicate.
///
/// The macro must be invoked inside `impl SequencePattern`, because the generated bodies call
/// `then` and `then_one_or_more` on `self`.
macro_rules! gen_then_from_is {
    ($quality:ident) => {
        paste! {
            /// Require the next token's kind to pass the matching `is_*` check.
            pub fn [< then_$quality >](self) -> Self {
                self.then(Box::new(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                }))
            }

            /// Hand the matching `is_*` check to `then_one_or_more`, so it is applied
            /// repeatedly to consecutive tokens.
            pub fn [< then_one_or_more_$quality s >](self) -> Self {
                self.then_one_or_more(Box::new(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                }))
            }

            /// Require the next token's kind to fail the matching `is_*` check.
            pub fn [< then_anything_but_$quality >](self) -> Self {
                self.then(Box::new(|tok: &Token, _source: &[char]| {
                    !tok.kind.[< is_$quality >]()
                }))
            }
        }
    };
}
70
71impl SequencePattern {
72    gen_then_from_is!(noun);
73    gen_then_from_is!(possessive_noun);
74    gen_then_from_is!(plural_noun);
75    gen_then_from_is!(verb);
76    gen_then_from_is!(linking_verb);
77    gen_then_from_is!(pronoun);
78    gen_then_from_is!(punctuation);
79    gen_then_from_is!(conjunction);
80    gen_then_from_is!(comma);
81    gen_then_from_is!(period);
82    gen_then_from_is!(number);
83    gen_then_from_is!(case_separator);
84    gen_then_from_is!(adverb);
85    gen_then_from_is!(adjective);
86    gen_then_from_is!(apostrophe);
87    gen_then_from_is!(hyphen);
88    gen_then_from_is!(article);
89    gen_then_from_is!(proper_noun);
90
91    pub fn then_word_set(self, set: WordSet) -> Self {
92        self.then(Box::new(set))
93    }
94
95    pub fn then_indefinite_article(self) -> Self {
96        self.then(Box::new(IndefiniteArticle::default()))
97    }
98
99    /// Add a pattern that looks for more complex ideas, like nouns with adjectives attached.
100    pub fn then_noun_phrase(self) -> Self {
101        self.then(Box::new(NounPhrase))
102    }
103
104    pub fn then_exact_word(mut self, word: &'static str) -> Self {
105        self.token_patterns
106            .push(Box::new(|tok: &Token, source: &[char]| {
107                if !tok.kind.is_word() {
108                    return false;
109                }
110
111                let tok_chars = tok.span.get_content(source);
112
113                let mut w_char_count = 0;
114                for (i, w_char) in word.chars().enumerate() {
115                    w_char_count += 1;
116
117                    if tok_chars.get(i).cloned() != Some(w_char) {
118                        return false;
119                    }
120                }
121
122                w_char_count == tok_chars.len()
123            }));
124        self
125    }
126
127    pub fn then_singular_subject(self) -> Self {
128        self.then(Box::new(SingularSubject::default()))
129    }
130
131    /// Shorthand for [`Self::any_capitalization_of`].
132    pub fn aco(word: &'static str) -> Self {
133        Self::any_capitalization_of(word)
134    }
135
136    pub fn any_capitalization_of(word: &'static str) -> Self {
137        Self::default().then_any_capitalization_of(word)
138    }
139
140    /// Shorthand for [`Self::then_any_capitalization_of`].
141    pub fn t_aco(self, word: &'static str) -> Self {
142        self.then_any_capitalization_of(word)
143    }
144
145    /// Match examples of `word` that have any capitalization.
146    pub fn then_any_capitalization_of(mut self, word: &'static str) -> Self {
147        self.token_patterns
148            .push(Box::new(AnyCapitalization::from_string(word)));
149        self
150    }
151
152    /// Shorthand for [`Self::then_exact_word_or_lowercase`].
153    pub fn t_eworl(self, word: &'static str) -> Self {
154        self.then_exact_word_or_lowercase(word)
155    }
156
157    pub fn then_exact_word_or_lowercase(mut self, word: &'static str) -> Self {
158        self.token_patterns
159            .push(Box::new(|tok: &Token, source: &[char]| {
160                if !tok.kind.is_word() {
161                    return false;
162                }
163
164                let tok_chars = tok.span.get_content(source).to_lower();
165
166                let mut w_char_count = 0;
167                for (i, w_char) in word.to_lowercase().chars().enumerate() {
168                    w_char_count += 1;
169
170                    if tok_chars.get(i).cloned() != Some(w_char) {
171                        return false;
172                    }
173                }
174
175                w_char_count == tok_chars.len()
176            }));
177        self
178    }
179
180    pub fn then_loose(mut self, kind: TokenKind) -> Self {
181        self.token_patterns
182            .push(Box::new(move |tok: &Token, _source: &[char]| {
183                kind.with_default_data() == tok.kind.with_default_data()
184            }));
185
186        self
187    }
188
189    pub fn then_any_word(mut self) -> Self {
190        self.token_patterns
191            .push(Box::new(|tok: &Token, _source: &[char]| tok.kind.is_word()));
192        self
193    }
194
195    pub fn then_strict(mut self, kind: TokenKind) -> Self {
196        self.token_patterns
197            .push(Box::new(move |tok: &Token, _source: &[char]| {
198                tok.kind == kind
199            }));
200        self
201    }
202
203    pub fn then_whitespace(mut self) -> Self {
204        self.token_patterns.push(Box::new(WhitespacePattern));
205        self
206    }
207
208    pub fn then_any_word_in(mut self, word_set: Lrc<HashSet<&'static str>>) -> Self {
209        self.token_patterns
210            .push(Box::new(move |tok: &Token, source: &[char]| {
211                let tok_chars = tok.span.get_content(source);
212                let word: String = tok_chars.iter().collect();
213                word_set.contains(word.as_str())
214            }));
215        self
216    }
217
218    pub fn then_one_or_more(mut self, pat: Box<dyn Pattern>) -> Self {
219        self.token_patterns
220            .push(Box::new(RepeatingPattern::new(pat, 0)));
221        self
222    }
223
224    pub fn then_anything(mut self) -> Self {
225        self.token_patterns.push(Box::new(AnyPattern));
226        self
227    }
228
229    pub fn then(mut self, pat: Box<dyn Pattern>) -> Self {
230        self.token_patterns.push(pat);
231        self
232    }
233}
234
235impl Pattern for SequencePattern {
236    fn matches(&self, tokens: &[Token], source: &[char]) -> usize {
237        let mut tok_cursor = 0;
238
239        for pat in self.token_patterns.iter() {
240            let match_length = pat.matches(&tokens[tok_cursor..], source);
241
242            if match_length == 0 {
243                return 0;
244            }
245
246            tok_cursor += match_length;
247        }
248
249        tok_cursor
250    }
251}
252
#[cfg(test)]
mod tests {
    use hashbrown::HashSet;

    use super::SequencePattern;
    use crate::patterns::Pattern;
    use crate::{Document, Lrc};

    /// Parse `text` as plain English and assert that `pattern`, applied at the first token,
    /// consumes every token in the document.
    fn assert_consumes_whole_doc(pattern: &SequencePattern, text: &str) {
        let doc = Document::new_plain_english_curated(text);

        assert_eq!(
            pattern.matches(doc.get_tokens(), doc.get_source()),
            doc.get_tokens().len()
        );
    }

    #[test]
    fn matches_n_whitespace_tokens() {
        let pat = SequencePattern::default()
            .then_any_word()
            .then_whitespace()
            .then_any_word();

        assert_consumes_whole_doc(&pat, "word\n    \nword");
    }

    #[test]
    fn matches_specific_words() {
        let pat = SequencePattern::default()
            .then_exact_word("she")
            .then_whitespace()
            .then_exact_word("her");

        assert_consumes_whole_doc(&pat, "she her");
    }

    #[test]
    fn matches_sets() {
        let pronouns: HashSet<&'static str> = ["his", "hers"].into_iter().collect();
        let pronouns = Lrc::new(pronouns);

        let pat = SequencePattern::default()
            .then_exact_word("it")
            .then_whitespace()
            .then_exact_word("was")
            .then_whitespace()
            .then_any_word_in(pronouns);

        assert_consumes_whole_doc(&pat, "it was hers");
    }
}