harper_core/patterns/
sequence_pattern.rs

1use paste::paste;
2
3use super::whitespace_pattern::WhitespacePattern;
4use super::{AnyPattern, IndefiniteArticle, Pattern, RepeatingPattern, Word};
5use crate::{Token, TokenKind};
6
/// A pattern that checks that a sequence of other patterns match.
/// There are specific extension methods available, but you can also use [`Self::then`] to add
/// arbitrary patterns.
///
/// The sub-patterns are tried one after another: each one is handed the tokens
/// that remain after the previous sub-pattern's match, and the whole sequence
/// fails as soon as any sub-pattern fails.
///
/// ## Example
///
/// Let's say we wanted to locate places in a [`Document`](crate::Document) where a determiner is
/// followed by whitespace and then a nominal.
/// We can do that with a `SequencePattern`.
///
/// ```rust
/// use harper_core::patterns::{SequencePattern, DocPattern};
/// use harper_core::{Document, Span};
///
/// let document = Document::new_markdown_default_curated("This is a test.");
///
/// let pattern = SequencePattern::default().then_determiner().then_whitespace().then_nominal();
/// let matches = pattern.find_all_matches_in_doc(&document);
///
/// // The pattern found that the tokens at indexes 4, 5, and 6 fit the criteria.
/// assert_eq!(matches, vec![Span::new(4, 7)]);
/// ```
#[derive(Default)]
pub struct SequencePattern {
    // The sub-patterns, stored in the order they were added; `matches` walks
    // them front to back.
    token_patterns: Vec<Box<dyn Pattern>>,
}
32
/// Generate a family of `then_*` builder methods from an available `is_*` function on
/// [`TokenKind`].
///
/// Invoked inside an `impl` block as `gen_then_from_is!(noun)`, this expands to
/// three consuming builder methods:
///
/// - `then_noun`: appends a predicate that accepts a token when
///   `tok.kind.is_noun()` is `true`.
/// - `then_one_or_more_nouns`: forwards the same predicate to
///   `then_one_or_more`. The method name is formed by appending a literal `s`
///   to the identifier, so irregular plurals are not handled.
/// - `then_anything_but_noun`: appends a predicate that accepts a token when
///   `tok.kind.is_noun()` is `false`.
macro_rules! gen_then_from_is {
    ($quality:ident) => {
        paste! {
            /// Append a predicate that accepts a token whose kind satisfies the
            /// corresponding `is_*` check.
            pub fn [< then_$quality >](mut self) -> Self {
                self.token_patterns.push(Box::new(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                }));

                self
            }

            /// Append, via `then_one_or_more`, a predicate that accepts tokens
            /// whose kind satisfies the corresponding `is_*` check.
            pub fn [< then_one_or_more_$quality s >](self) -> Self {
                self.then_one_or_more(Box::new(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                }))
            }

            /// Append a predicate that accepts a token whose kind does NOT
            /// satisfy the corresponding `is_*` check.
            pub fn [< then_anything_but_$quality >](mut self) -> Self {
                self.token_patterns.push(Box::new(|tok: &Token, _source: &[char]| {
                    // Plain negation instead of `if .. { false } else { true }`.
                    !tok.kind.[< is_$quality >]()
                }));

                self
            }
        }
    };
}
65
66impl SequencePattern {
67    gen_then_from_is!(nominal);
68    gen_then_from_is!(noun);
69    gen_then_from_is!(possessive_nominal);
70    gen_then_from_is!(plural_nominal);
71    gen_then_from_is!(verb);
72    gen_then_from_is!(auxiliary_verb);
73    gen_then_from_is!(linking_verb);
74    gen_then_from_is!(pronoun);
75    gen_then_from_is!(punctuation);
76    gen_then_from_is!(conjunction);
77    gen_then_from_is!(comma);
78    gen_then_from_is!(period);
79    gen_then_from_is!(number);
80    gen_then_from_is!(case_separator);
81    gen_then_from_is!(adverb);
82    gen_then_from_is!(adjective);
83    gen_then_from_is!(apostrophe);
84    gen_then_from_is!(hyphen);
85    gen_then_from_is!(determiner);
86    gen_then_from_is!(proper_noun);
87    gen_then_from_is!(preposition);
88    gen_then_from_is!(not_plural_nominal);
89
90    pub fn then_indefinite_article(self) -> Self {
91        self.then(IndefiniteArticle::default())
92    }
93
94    /// Match examples of `word` case-sensitively.
95    pub fn then_exact_word(mut self, word: &'static str) -> Self {
96        self.token_patterns.push(Box::new(Word::new_exact(word)));
97        self
98    }
99
100    /// Shorthand for [`Self::any_capitalization_of`].
101    pub fn aco(word: &'static str) -> Self {
102        Self::any_capitalization_of(word)
103    }
104
105    pub fn any_capitalization_of(word: &'static str) -> Self {
106        Self::default().then_any_capitalization_of(word)
107    }
108
109    /// Shorthand for [`Self::then_any_capitalization_of`].
110    pub fn t_aco(self, word: &'static str) -> Self {
111        self.then_any_capitalization_of(word)
112    }
113
114    /// Match examples of `word` that have any capitalization.
115    pub fn then_any_capitalization_of(mut self, word: &'static str) -> Self {
116        self.token_patterns.push(Box::new(Word::new(word)));
117        self
118    }
119
120    /// Matches any word.
121    pub fn then_any_word(mut self) -> Self {
122        self.token_patterns
123            .push(Box::new(|tok: &Token, _source: &[char]| tok.kind.is_word()));
124        self
125    }
126
127    /// Matches any token whose `Kind` exactly matches.
128    pub fn then_strict(mut self, kind: TokenKind) -> Self {
129        self.token_patterns
130            .push(Box::new(move |tok: &Token, _source: &[char]| {
131                tok.kind == kind
132            }));
133        self
134    }
135
136    /// Shorthand for [`Self::then_whitespace`].
137    pub fn t_ws(self) -> Self {
138        self.then_whitespace()
139    }
140
141    /// Match against one or more whitespace tokens.
142    pub fn then_whitespace(mut self) -> Self {
143        self.token_patterns.push(Box::new(WhitespacePattern));
144        self
145    }
146
147    pub fn then_one_or_more(mut self, pat: impl Pattern + 'static) -> Self {
148        self.token_patterns
149            .push(Box::new(RepeatingPattern::new(Box::new(pat), 1)));
150        self
151    }
152
153    /// Shorthand for [`Self::then_anything`].
154    pub fn t_any(self) -> Self {
155        self.then_anything()
156    }
157
158    /// Match against any single token.
159    /// More of a filler than anything else.
160    pub fn then_anything(mut self) -> Self {
161        self.token_patterns.push(Box::new(AnyPattern));
162        self
163    }
164
165    pub fn then(mut self, pat: impl Pattern + 'static) -> Self {
166        self.token_patterns.push(Box::new(pat));
167        self
168    }
169}
170
171impl Pattern for SequencePattern {
172    fn matches(&self, tokens: &[Token], source: &[char]) -> Option<usize> {
173        let mut tok_cursor = 0;
174
175        for pat in self.token_patterns.iter() {
176            let match_length = pat.matches(&tokens[tok_cursor..], source)?;
177            tok_cursor += match_length;
178        }
179
180        Some(tok_cursor)
181    }
182}
183
#[cfg(test)]
mod tests {

    use super::SequencePattern;
    use crate::Document;
    use crate::patterns::{DocPattern, Pattern};

    /// Assert that `pat`, applied at the start of `text`, consumes every token
    /// of the resulting document.
    fn assert_consumes_whole_doc(pat: &SequencePattern, text: &str) {
        let doc = Document::new_plain_english_curated(text);
        let expected = Some(doc.get_tokens().len());
        let actual = pat.matches(doc.get_tokens(), doc.get_source());

        assert_eq!(actual, expected);
    }

    /// Count how many matches `pat` finds anywhere in `text`.
    fn count_matches(pat: &SequencePattern, text: &str) -> usize {
        let doc = Document::new_plain_english_curated(text);
        pat.find_all_matches_in_doc(&doc).len()
    }

    #[test]
    fn matches_n_whitespace_tokens() {
        let pat = SequencePattern::default()
            .then_any_word()
            .then_whitespace()
            .then_any_word();

        assert_consumes_whole_doc(&pat, "word\n    \nword");
    }

    #[test]
    fn matches_specific_words() {
        let pat = SequencePattern::default()
            .then_exact_word("she")
            .then_whitespace()
            .then_exact_word("her");

        assert_consumes_whole_doc(&pat, "she her");
    }

    #[test]
    fn match_t_aco_and_t_ws() {
        let pat = SequencePattern::aco("foo").t_ws().t_aco("bar");

        assert_consumes_whole_doc(&pat, "foo\nBAR");
    }

    #[test]
    fn exact_word_matches_title_case() {
        let pat = SequencePattern::default().then_exact_word("Foo");

        assert_consumes_whole_doc(&pat, "Foo");
    }

    #[test]
    fn exact_means_case_sensitive() {
        let pat = SequencePattern::default().then_exact_word("Foo");

        // Only "Foo" should match
        assert_eq!(count_matches(&pat, "foo Foo FOO"), 1);
    }

    #[test]
    fn any_capitalization_of_matches_different_cases() {
        let pat = SequencePattern::aco("foo");

        // All three should match
        assert_eq!(count_matches(&pat, "foo Foo FOO"), 3);
    }
}