harper_core/patterns/
sequence_pattern.rs

1use paste::paste;
2
3use super::whitespace_pattern::WhitespacePattern;
4use super::{
5    AnyCapitalization, AnyPattern, IndefiniteArticle, Pattern, RepeatingPattern, SingularSubject,
6};
7use crate::{Token, TokenKind};
8
9/// A pattern that checks that a sequence of other patterns match.
10/// There are specific extension methods available, but you can also use [`Self::then`] to add
11/// arbitrary patterns.
12///
13/// ## Example
14///
15/// Let's say we wanted to locate places in a [`Document`](crate::Document) where an article is followed by a noun.
16/// We can do that with a `SequencePattern`.
17///
18/// ```rust
19/// use harper_core::patterns::{SequencePattern, DocPattern};
20/// use harper_core::{Document, Span};
21///
22/// let document = Document::new_markdown_default_curated("This is a test.");
23///
24/// let pattern = SequencePattern::default().then_article().then_whitespace().then_noun();
25/// let matches = pattern.find_all_matches_in_doc(&document);
26///
27/// // The pattern found that the tokens at indexes 4, 5, and 6 fit the criteria.
28/// assert_eq!(matches, vec![Span::new(4, 7)]);
29/// ```
30#[derive(Default)]
31pub struct SequencePattern {
32    token_patterns: Vec<Box<dyn Pattern>>,
33}
34
/// Generate `then_*` builder methods from an available `is_*` function on [`TokenKind`].
///
/// For an identifier `quality`, this expands to three methods:
///
/// - `then_<quality>`: append a step matching one token for which
///   `tok.kind.is_<quality>()` is true.
/// - `then_one_or_more_<quality>s`: pass the same predicate to
///   `then_one_or_more`.
/// - `then_anything_but_<quality>`: append a step matching one token for which
///   `tok.kind.is_<quality>()` is false.
macro_rules! gen_then_from_is {
    ($quality:ident) => {
        paste! {
            /// Append a step that matches a single token satisfying the
            /// corresponding `is_*` predicate on its kind.
            pub fn [< then_$quality >](mut self) -> Self {
                self.token_patterns.push(Box::new(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                }));

                self
            }

            /// Append a repeated step built from the corresponding `is_*`
            /// predicate, via `then_one_or_more`.
            pub fn [< then_one_or_more_$quality s >](self) -> Self {
                // `then_one_or_more` boxes its argument itself, so the closure
                // is handed over unboxed.
                self.then_one_or_more(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                })
            }

            /// Append a step that matches a single token which does *not*
            /// satisfy the corresponding `is_*` predicate on its kind.
            pub fn [< then_anything_but_$quality >](mut self) -> Self {
                self.token_patterns.push(Box::new(|tok: &Token, _source: &[char]| {
                    !tok.kind.[< is_$quality >]()
                }));

                self
            }
        }
    };
}
67
68impl SequencePattern {
69    gen_then_from_is!(noun);
70    gen_then_from_is!(possessive_noun);
71    gen_then_from_is!(plural_noun);
72    gen_then_from_is!(verb);
73    gen_then_from_is!(linking_verb);
74    gen_then_from_is!(pronoun);
75    gen_then_from_is!(punctuation);
76    gen_then_from_is!(conjunction);
77    gen_then_from_is!(comma);
78    gen_then_from_is!(period);
79    gen_then_from_is!(number);
80    gen_then_from_is!(case_separator);
81    gen_then_from_is!(adverb);
82    gen_then_from_is!(adjective);
83    gen_then_from_is!(apostrophe);
84    gen_then_from_is!(hyphen);
85    gen_then_from_is!(article);
86    gen_then_from_is!(proper_noun);
87    gen_then_from_is!(preposition);
88
89    pub fn then_indefinite_article(self) -> Self {
90        self.then(IndefiniteArticle::default())
91    }
92
93    pub fn then_exact_word(mut self, word: &'static str) -> Self {
94        self.token_patterns
95            .push(Box::new(|tok: &Token, source: &[char]| {
96                if !tok.kind.is_word() {
97                    return false;
98                }
99
100                let tok_chars = tok.span.get_content(source);
101
102                let mut w_char_count = 0;
103                for (i, w_char) in word.chars().enumerate() {
104                    w_char_count += 1;
105
106                    if tok_chars.get(i).cloned() != Some(w_char) {
107                        return false;
108                    }
109                }
110
111                w_char_count == tok_chars.len()
112            }));
113        self
114    }
115
116    pub fn then_singular_subject(self) -> Self {
117        self.then(SingularSubject::default())
118    }
119
120    /// Shorthand for [`Self::any_capitalization_of`].
121    pub fn aco(word: &'static str) -> Self {
122        Self::any_capitalization_of(word)
123    }
124
125    pub fn any_capitalization_of(word: &'static str) -> Self {
126        Self::default().then_any_capitalization_of(word)
127    }
128
129    /// Shorthand for [`Self::then_any_capitalization_of`].
130    pub fn t_aco(self, word: &'static str) -> Self {
131        self.then_any_capitalization_of(word)
132    }
133
134    /// Match examples of `word` that have any capitalization.
135    pub fn then_any_capitalization_of(mut self, word: &'static str) -> Self {
136        self.token_patterns
137            .push(Box::new(AnyCapitalization::of(word)));
138        self
139    }
140
141    /// Matches any word.
142    pub fn then_any_word(mut self) -> Self {
143        self.token_patterns
144            .push(Box::new(|tok: &Token, _source: &[char]| tok.kind.is_word()));
145        self
146    }
147
148    /// Matches any token whose `Kind` exactly matches.
149    pub fn then_strict(mut self, kind: TokenKind) -> Self {
150        self.token_patterns
151            .push(Box::new(move |tok: &Token, _source: &[char]| {
152                tok.kind == kind
153            }));
154        self
155    }
156
157    /// Match against one or more whitespace tokens.
158    pub fn then_whitespace(mut self) -> Self {
159        self.token_patterns.push(Box::new(WhitespacePattern));
160        self
161    }
162
163    pub fn then_one_or_more(mut self, pat: impl Pattern + 'static) -> Self {
164        self.token_patterns
165            .push(Box::new(RepeatingPattern::new(Box::new(pat), 0)));
166        self
167    }
168
169    /// Match against any single token.
170    /// More of a filler than anything else.
171    pub fn then_anything(mut self) -> Self {
172        self.token_patterns.push(Box::new(AnyPattern));
173        self
174    }
175
176    pub fn then(mut self, pat: impl Pattern + 'static) -> Self {
177        self.token_patterns.push(Box::new(pat));
178        self
179    }
180}
181
182impl Pattern for SequencePattern {
183    fn matches(&self, tokens: &[Token], source: &[char]) -> usize {
184        let mut tok_cursor = 0;
185
186        for pat in self.token_patterns.iter() {
187            let match_length = pat.matches(&tokens[tok_cursor..], source);
188
189            if match_length == 0 {
190                return 0;
191            }
192
193            tok_cursor += match_length;
194        }
195
196        tok_cursor
197    }
198}
199
#[cfg(test)]
mod tests {
    use super::SequencePattern;
    use crate::Document;
    use crate::patterns::Pattern;

    /// Assert that `pattern`, applied at the first token of `text`, consumes
    /// every token of the resulting document.
    fn assert_consumes_whole_doc(pattern: SequencePattern, text: &str) {
        let doc = Document::new_plain_english_curated(text);

        assert_eq!(
            pattern.matches(doc.get_tokens(), doc.get_source()),
            doc.get_tokens().len()
        );
    }

    #[test]
    fn matches_n_whitespace_tokens() {
        let word_gap_word = SequencePattern::default()
            .then_any_word()
            .then_whitespace()
            .then_any_word();

        assert_consumes_whole_doc(word_gap_word, "word\n    \nword");
    }

    #[test]
    fn matches_specific_words() {
        let she_then_her = SequencePattern::default()
            .then_exact_word("she")
            .then_whitespace()
            .then_exact_word("her");

        assert_consumes_whole_doc(she_then_her, "she her");
    }
}