harper_core/patterns/
sequence_pattern.rs

1use paste::paste;
2
3use super::whitespace_pattern::WhitespacePattern;
4use super::{
5    AnyCapitalization, AnyPattern, IndefiniteArticle, Pattern, RepeatingPattern, SingularSubject,
6};
7use crate::{Token, TokenKind};
8
9/// A pattern that checks that a sequence of other patterns match.
10/// There are specific extension methods available, but you can also use [`Self::then`] to add
11/// arbitrary patterns.
12///
13/// ## Example
14///
15/// Let's say we wanted to locate places in a [`Document`](crate::Document) where an article is followed by a noun.
16/// We can do that with a `SequencePattern`.
17///
18/// ```rust
19/// use harper_core::patterns::{SequencePattern, DocPattern};
20/// use harper_core::{Document, Span};
21///
22/// let document = Document::new_markdown_default_curated("This is a test.");
23///
24/// let pattern = SequencePattern::default().then_determiner().then_whitespace().then_nominal();
25/// let matches = pattern.find_all_matches_in_doc(&document);
26///
27/// // The pattern found that the tokens at indexes 4, 5, and 6 fit the criteria.
28/// assert_eq!(matches, vec![Span::new(4, 7)]);
29/// ```
30#[derive(Default)]
31pub struct SequencePattern {
32    token_patterns: Vec<Box<dyn Pattern>>,
33}
34
/// Generate builder methods from an `is_*` predicate available on [`TokenKind`].
///
/// Given an identifier `foo`, this expands (inside the enclosing `impl`) to three methods:
/// `then_foo`, `then_one_or_more_foos` and `then_anything_but_foo`. Each one tests a token
/// by calling `tok.kind.is_foo()`.
macro_rules! gen_then_from_is {
    ($quality:ident) => {
        paste! {
            /// Append a pattern that accepts a token whose kind satisfies the predicate.
            pub fn [< then_$quality >](self) -> Self {
                self.then(|tok: &Token, _source: &[char]| tok.kind.[< is_$quality >]())
            }

            /// Append a repeating pattern over tokens whose kind satisfies the predicate.
            pub fn [< then_one_or_more_$quality s >](self) -> Self {
                // The closure is passed unboxed: `then_one_or_more` already boxes its
                // argument, so boxing here would only add a second allocation.
                self.then_one_or_more(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                })
            }

            /// Append a pattern that accepts a token whose kind does NOT satisfy the predicate.
            pub fn [< then_anything_but_$quality >](self) -> Self {
                self.then(|tok: &Token, _source: &[char]| !tok.kind.[< is_$quality >]())
            }
        }
    };
}
67
68impl SequencePattern {
69    gen_then_from_is!(nominal);
70    gen_then_from_is!(noun);
71    gen_then_from_is!(possessive_nominal);
72    gen_then_from_is!(plural_nominal);
73    gen_then_from_is!(verb);
74    gen_then_from_is!(linking_verb);
75    gen_then_from_is!(pronoun);
76    gen_then_from_is!(punctuation);
77    gen_then_from_is!(conjunction);
78    gen_then_from_is!(comma);
79    gen_then_from_is!(period);
80    gen_then_from_is!(number);
81    gen_then_from_is!(case_separator);
82    gen_then_from_is!(adverb);
83    gen_then_from_is!(adjective);
84    gen_then_from_is!(apostrophe);
85    gen_then_from_is!(hyphen);
86    gen_then_from_is!(determiner);
87    gen_then_from_is!(proper_noun);
88    gen_then_from_is!(preposition);
89
90    pub fn then_indefinite_article(self) -> Self {
91        self.then(IndefiniteArticle::default())
92    }
93
94    pub fn then_exact_word(mut self, word: &'static str) -> Self {
95        self.token_patterns
96            .push(Box::new(|tok: &Token, source: &[char]| {
97                if !tok.kind.is_word() {
98                    return false;
99                }
100
101                let tok_chars = tok.span.get_content(source);
102
103                let mut w_char_count = 0;
104                for (i, w_char) in word.chars().enumerate() {
105                    w_char_count += 1;
106
107                    if tok_chars.get(i).cloned() != Some(w_char) {
108                        return false;
109                    }
110                }
111
112                w_char_count == tok_chars.len()
113            }));
114        self
115    }
116
117    pub fn then_singular_subject(self) -> Self {
118        self.then(SingularSubject::default())
119    }
120
121    /// Shorthand for [`Self::any_capitalization_of`].
122    pub fn aco(word: &'static str) -> Self {
123        Self::any_capitalization_of(word)
124    }
125
126    pub fn any_capitalization_of(word: &'static str) -> Self {
127        Self::default().then_any_capitalization_of(word)
128    }
129
130    /// Shorthand for [`Self::then_any_capitalization_of`].
131    pub fn t_aco(self, word: &'static str) -> Self {
132        self.then_any_capitalization_of(word)
133    }
134
135    /// Match examples of `word` that have any capitalization.
136    pub fn then_any_capitalization_of(mut self, word: &'static str) -> Self {
137        self.token_patterns
138            .push(Box::new(AnyCapitalization::of(word)));
139        self
140    }
141
142    /// Matches any word.
143    pub fn then_any_word(mut self) -> Self {
144        self.token_patterns
145            .push(Box::new(|tok: &Token, _source: &[char]| tok.kind.is_word()));
146        self
147    }
148
149    /// Matches any token whose `Kind` exactly matches.
150    pub fn then_strict(mut self, kind: TokenKind) -> Self {
151        self.token_patterns
152            .push(Box::new(move |tok: &Token, _source: &[char]| {
153                tok.kind == kind
154            }));
155        self
156    }
157
158    /// Match against one or more whitespace tokens.
159    pub fn then_whitespace(mut self) -> Self {
160        self.token_patterns.push(Box::new(WhitespacePattern));
161        self
162    }
163
164    pub fn then_one_or_more(mut self, pat: impl Pattern + 'static) -> Self {
165        self.token_patterns
166            .push(Box::new(RepeatingPattern::new(Box::new(pat), 0)));
167        self
168    }
169
170    /// Match against any single token.
171    /// More of a filler than anything else.
172    pub fn then_anything(mut self) -> Self {
173        self.token_patterns.push(Box::new(AnyPattern));
174        self
175    }
176
177    pub fn then(mut self, pat: impl Pattern + 'static) -> Self {
178        self.token_patterns.push(Box::new(pat));
179        self
180    }
181}
182
183impl Pattern for SequencePattern {
184    fn matches(&self, tokens: &[Token], source: &[char]) -> usize {
185        let mut tok_cursor = 0;
186
187        for pat in self.token_patterns.iter() {
188            let match_length = pat.matches(&tokens[tok_cursor..], source);
189
190            if match_length == 0 {
191                return 0;
192            }
193
194            tok_cursor += match_length;
195        }
196
197        tok_cursor
198    }
199}
200
#[cfg(test)]
mod tests {
    use super::SequencePattern;
    use crate::Document;
    use crate::patterns::Pattern;

    /// Assert that `pat`, applied at the start of the document built from `text`, consumes
    /// every one of its tokens.
    fn assert_consumes_all(pat: &SequencePattern, text: &str) {
        let doc = Document::new_plain_english_curated(text);
        let tokens = doc.get_tokens();

        assert_eq!(pat.matches(tokens, doc.get_source()), tokens.len());
    }

    #[test]
    fn matches_n_whitespace_tokens() {
        let pat = SequencePattern::default()
            .then_any_word()
            .then_whitespace()
            .then_any_word();

        assert_consumes_all(&pat, "word\n    \nword");
    }

    #[test]
    fn matches_specific_words() {
        let pat = SequencePattern::default()
            .then_exact_word("she")
            .then_whitespace()
            .then_exact_word("her");

        assert_consumes_all(&pat, "she her");
    }
}