harper_core/expr/
sequence_expr.rs

1use paste::paste;
2
3use crate::{
4    CharStringExt, Span, Token, TokenKind,
5    expr::{FirstMatchOf, FixedPhrase, LongestMatchOf},
6    patterns::{AnyPattern, IndefiniteArticle, WhitespacePattern, Word, WordSet},
7};
8
9use super::{Expr, Optional, Repeating, Step, UnlessStep};
10
11#[derive(Default)]
12pub struct SequenceExpr {
13    exprs: Vec<Box<dyn Expr>>,
14}
15
/// Generate a family of `then_*` builder methods from an available `is_*` function on
/// [`TokenKind`].
///
/// Invoked as `gen_then_from_is!(quality)` inside an `impl` block that provides `then`,
/// `then_optional` and `then_one_or_more`, it expands to four methods:
///
/// - `then_<quality>`: one token for which `is_<quality>()` is true.
/// - `then_optional_<quality>`: the same step, but optional.
/// - `then_one_or_more_<quality>s`: one or more consecutive such tokens (note the plural `s`).
/// - `then_anything_but_<quality>`: one token for which `is_<quality>()` is false.
macro_rules! gen_then_from_is {
    ($quality:ident) => {
        paste! {
            #[doc = concat!("Adds a step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_$quality >] (self) -> Self {
                self.then(|tok: &Token, _source: &[char]| tok.kind.[< is_$quality >]())
            }

            #[doc = concat!("Adds an optional step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_optional_$quality >] (self) -> Self {
                self.then_optional(|tok: &Token, _source: &[char]| tok.kind.[< is_$quality >]())
            }

            #[doc = concat!("Adds a step matching one or more consecutive tokens where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_one_or_more_$quality s >] (self) -> Self {
                // `then_one_or_more` boxes its argument itself, so the closure is passed unboxed.
                self.then_one_or_more(|tok: &Token, _source: &[char]| tok.kind.[< is_$quality >]())
            }

            #[doc = concat!("Adds a step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns false.")]
            pub fn [< then_anything_but_$quality >] (self) -> Self {
                self.then(|tok: &Token, _source: &[char]| !tok.kind.[< is_$quality >]())
            }
        }
    };
}
54
55impl Expr for SequenceExpr {
56    /// Run the expression starting at an index, returning the total matched window.
57    ///
58    /// If any step returns `None`, the entire expression does as well.
59    fn run(&self, mut cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
60        let mut window = Span::new_with_len(cursor, 0);
61
62        for cur_expr in &self.exprs {
63            let out = cur_expr.run(cursor, tokens, source)?;
64
65            // Only expand the window if the match actually covers some tokens
66            if out.end > out.start {
67                window.expand_to_include(out.start);
68                window.expand_to_include(out.end.checked_sub(1).unwrap_or(out.start));
69            }
70
71            // Only advance cursor if we actually matched something
72            if out.end > cursor {
73                cursor = out.end;
74            } else if out.start < cursor {
75                cursor = out.start;
76            }
77            // If both start and end are equal to cursor, don't move the cursor
78        }
79
80        Some(window)
81    }
82}
83
84impl SequenceExpr {
85    // Constructor methods
86
87    /// Construct a new sequence with a [`Word`] at the beginning of the operation list.
88    pub fn any_capitalization_of(word: &'static str) -> Self {
89        Self::default().then_any_capitalization_of(word)
90    }
91
92    /// Shorthand for [`Self::any_capitalization_of`].
93    pub fn aco(word: &'static str) -> Self {
94        Self::any_capitalization_of(word)
95    }
96
97    /// Match the first of multiple expressions.
98    pub fn any_of(exprs: Vec<Box<dyn Expr>>) -> Self {
99        Self::default().then_any_of(exprs)
100    }
101
102    /// Match any word from the given set of words, case-insensitive.
103    pub fn word_set(words: &'static [&'static str]) -> Self {
104        Self::default().then_word_set(words)
105    }
106
107    // General builder methods
108
109    /// Push an [expression](Expr) to the operation list.
110    pub fn then(mut self, expr: impl Expr + 'static) -> Self {
111        self.exprs.push(Box::new(expr));
112        self
113    }
114
115    /// Pushes an expression that could move the cursor to the sequence, but does not require it.
116    pub fn then_optional(mut self, expr: impl Expr + 'static) -> Self {
117        self.exprs.push(Box::new(Optional::new(expr)));
118        self
119    }
120
121    /// Pushes an expression that will match any of the provided expressions.
122    ///
123    /// If more than one of the provided expressions match, this function provides no guarantee
124    /// as to which match will end up being used. If you need to get the longest of multiple
125    /// matches, use [`Self::then_longest_of()`] instead.
126    pub fn then_any_of(mut self, exprs: Vec<Box<dyn Expr>>) -> Self {
127        self.exprs.push(Box::new(FirstMatchOf::new(exprs)));
128        self
129    }
130
131    /// Pushes an expression that will match the longest of the provided expressions.
132    ///
133    /// If you don't need the longest match, prefer using the short-circuiting
134    /// [`Self::then_any_of()`] instead.
135    pub fn then_longest_of(mut self, exprs: Vec<Box<dyn Expr>>) -> Self {
136        self.exprs.push(Box::new(LongestMatchOf::new(exprs)));
137        self
138    }
139
140    /// Appends the steps in `other` onto the end of `self`.
141    /// This is more efficient than [`Self::then`] because it avoids pointer redirection.
142    pub fn then_seq(mut self, mut other: Self) -> Self {
143        self.exprs.append(&mut other.exprs);
144        self
145    }
146
147    /// Pushes an expression that will match any word from the given set of words, case-insensitive.
148    pub fn then_word_set(self, words: &'static [&'static str]) -> Self {
149        self.then(WordSet::new(words))
150    }
151
152    /// Matches any token whose `Kind` exactly matches.
153    pub fn then_strict(self, kind: TokenKind) -> Self {
154        self.then(move |tok: &Token, _source: &[char]| tok.kind == kind)
155    }
156
157    /// Match against one or more whitespace tokens.
158    pub fn then_whitespace(self) -> Self {
159        self.then(WhitespacePattern)
160    }
161
162    /// Shorthand for [`Self::then_whitespace`].
163    pub fn t_ws(self) -> Self {
164        self.then_whitespace()
165    }
166
167    pub fn then_one_or_more(self, expr: impl Expr + 'static) -> Self {
168        self.then(Repeating::new(Box::new(expr), 1))
169    }
170
171    /// Create a new condition that will step one token forward if met.
172    /// If the condition is _not_ met, the whole expression returns `None`.
173    ///
174    /// This can be used to build out exceptions to other rules.
175    ///
176    /// See [`UnlessStep`] for more info.
177    pub fn then_unless(self, condition: impl Expr + 'static) -> Self {
178        self.then(UnlessStep::new(condition, |_tok: &Token, _src: &[char]| {
179            true
180        }))
181    }
182
183    /// Match any single token.
184    ///
185    /// See [`AnyPattern`] for more info.
186    pub fn then_anything(self) -> Self {
187        self.then(AnyPattern)
188    }
189
190    /// Match any single token.
191    ///
192    /// Shorthand for [`Self::then_anything`].
193    pub fn t_any(self) -> Self {
194        self.then_anything()
195    }
196
197    // Word matching methods
198
199    /// Matches any word.
200    pub fn then_any_word(self) -> Self {
201        self.then(|tok: &Token, _source: &[char]| tok.kind.is_word())
202    }
203
204    /// Match examples of `word` that have any capitalization.
205    pub fn then_any_capitalization_of(self, word: &'static str) -> Self {
206        self.then(Word::new(word))
207    }
208
209    /// Shorthand for [`Self::then_any_capitalization_of`].
210    pub fn t_aco(self, word: &'static str) -> Self {
211        self.then_any_capitalization_of(word)
212    }
213
214    /// Match examples of `word` case-sensitively.
215    pub fn then_exact_word(self, word: &'static str) -> Self {
216        self.then(Word::new_exact(word))
217    }
218
219    /// Match a fixed phrase.
220    pub fn then_fixed_phrase(self, phrase: &'static str) -> Self {
221        self.then(FixedPhrase::from_phrase(phrase))
222    }
223
224    /// Match any word except the ones in `words`.
225    pub fn then_word_except(self, words: &'static [&'static str]) -> Self {
226        self.then(move |tok: &Token, src: &[char]| {
227            !tok.kind.is_word()
228                || !words
229                    .iter()
230                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
231        })
232    }
233
234    // Predicate matching methods
235
236    /// Match a token of a given kind which is not in the list of words.
237    pub fn then_kind_except<F>(self, pred: F, words: &'static [&'static str]) -> Self
238    where
239        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
240    {
241        self.then(move |tok: &Token, src: &[char]| {
242            pred(&tok.kind)
243                && !words
244                    .iter()
245                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
246        })
247    }
248
249    /// Adds a step matching a token where both token kind predicates return true.
250    pub fn then_kind_both<F1, F2>(self, pred1: F1, pred2: F2) -> Self
251    where
252        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
253        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
254    {
255        self.then(move |tok: &Token, _source: &[char]| pred1(&tok.kind) && pred2(&tok.kind))
256    }
257
258    /// Adds a step matching a token where either of the two token kind predicates returns true.
259    pub fn then_kind_either<F1, F2>(self, pred1: F1, pred2: F2) -> Self
260    where
261        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
262        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
263    {
264        self.then(move |tok: &Token, _source: &[char]| pred1(&tok.kind) || pred2(&tok.kind))
265    }
266
267    /// Adds a step matching a token where any of the token kind predicates returns true.
268    pub fn then_kind_any<F>(self, preds: &'static [F]) -> Self
269    where
270        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
271    {
272        self.then(move |tok: &Token, _source: &[char]| preds.iter().any(|pred| pred(&tok.kind)))
273    }
274
275    pub fn then_kind_any_or_words<F>(
276        self,
277        preds: &'static [F],
278        words: &'static [&'static str],
279    ) -> Self
280    where
281        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
282    {
283        self.then(move |tok: &Token, src: &[char]| {
284            preds.iter().any(|pred| pred(&tok.kind))
285                // && !words
286                || words
287                    .iter()
288                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
289        })
290    }
291
292    /// Adds a step matching a token where the first token kind predicate returns true and the second returns false.
293    pub fn then_kind_is_but_is_not<F1, F2>(self, pred1: F1, pred2: F2) -> Self
294    where
295        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
296        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
297    {
298        self.then(move |tok: &Token, _source: &[char]| pred1(&tok.kind) && !pred2(&tok.kind))
299    }
300
301    /// Adds a step matching a token where the first token kind predicate returns true and the second returns false,
302    /// and the token is not in the list of words.
303    pub fn then_kind_is_but_is_not_except<F1, F2>(
304        self,
305        pred1: F1,
306        pred2: F2,
307        words: &'static [&'static str],
308    ) -> Self
309    where
310        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
311        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
312    {
313        self.then(move |tok: &Token, src: &[char]| {
314            pred1(&tok.kind)
315                && !pred2(&tok.kind)
316                && !words
317                    .iter()
318                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
319        })
320    }
321
322    // Word property matching methods
323
324    // Out-of-vocabulary word. (Words not in the dictionary)
325    gen_then_from_is!(oov);
326    gen_then_from_is!(swear);
327
328    // Part-of-speech matching methods
329
330    // Nominals (nouns and pronouns)
331
332    gen_then_from_is!(nominal);
333    gen_then_from_is!(plural_nominal);
334    gen_then_from_is!(non_plural_nominal);
335    gen_then_from_is!(possessive_nominal);
336
337    // Nouns
338
339    gen_then_from_is!(noun);
340    gen_then_from_is!(proper_noun);
341    gen_then_from_is!(mass_noun_only);
342
343    // Pronouns
344
345    gen_then_from_is!(pronoun);
346    gen_then_from_is!(personal_pronoun);
347    gen_then_from_is!(first_person_singular_pronoun);
348    gen_then_from_is!(first_person_plural_pronoun);
349    gen_then_from_is!(second_person_pronoun);
350    gen_then_from_is!(third_person_pronoun);
351    gen_then_from_is!(third_person_singular_pronoun);
352    gen_then_from_is!(third_person_plural_pronoun);
353    gen_then_from_is!(object_pronoun);
354
355    // Verbs
356
357    gen_then_from_is!(verb);
358    gen_then_from_is!(auxiliary_verb);
359    gen_then_from_is!(linking_verb);
360
361    // Adjectives
362
363    gen_then_from_is!(adjective);
364    gen_then_from_is!(positive_adjective);
365    gen_then_from_is!(comparative_adjective);
366    gen_then_from_is!(superlative_adjective);
367
368    // Adverbs
369
370    gen_then_from_is!(adverb);
371
372    // Determiners
373
374    gen_then_from_is!(determiner);
375    gen_then_from_is!(demonstrative_determiner);
376    gen_then_from_is!(quantifier);
377    gen_then_from_is!(non_quantifier_determiner);
378
379    /// Push an [`IndefiniteArticle`] to the end of the operation list.
380    pub fn then_indefinite_article(self) -> Self {
381        self.then(IndefiniteArticle::default())
382    }
383
384    // Other parts of speech
385
386    gen_then_from_is!(conjunction);
387    gen_then_from_is!(preposition);
388
389    // Punctuation
390
391    gen_then_from_is!(punctuation);
392    gen_then_from_is!(apostrophe);
393    gen_then_from_is!(comma);
394    gen_then_from_is!(hyphen);
395    gen_then_from_is!(period);
396    gen_then_from_is!(semicolon);
397
398    // Other
399
400    gen_then_from_is!(number);
401    gen_then_from_is!(case_separator);
402    gen_then_from_is!(likely_homograph);
403}
404
405impl<S> From<S> for SequenceExpr
406where
407    S: Step + 'static,
408{
409    fn from(step: S) -> Self {
410        Self {
411            exprs: vec![Box::new(step)],
412        }
413    }
414}
415
#[cfg(test)]
mod tests {
    use crate::{
        Document, TokenKind,
        expr::{ExprExt, SequenceExpr},
        linting::tests::SpanVecExt,
    };

    /// Sentence every test in this module is run against.
    const SAMPLE: &str = "Use a good example.";

    /// Only tokens that are both a noun and a verb are matched.
    #[test]
    fn test_kind_both() {
        let doc = Document::new_plain_english_curated(SAMPLE);
        let expr = SequenceExpr::default().then_kind_both(TokenKind::is_noun, TokenKind::is_verb);

        let found: Vec<_> = expr.iter_matches_in_doc(&doc).collect();

        assert_eq!(found.to_strings(&doc), vec!["Use", "good", "example"]);
    }

    /// Tokens that are an adjective or a determiner are matched.
    #[test]
    fn test_adjective_or_determiner() {
        let doc = Document::new_plain_english_curated(SAMPLE);
        let expr = SequenceExpr::default()
            .then_kind_either(TokenKind::is_adjective, TokenKind::is_determiner);

        let found: Vec<_> = expr.iter_matches_in_doc(&doc).collect();

        assert_eq!(found.to_strings(&doc), vec!["a", "good"]);
    }

    /// Nouns that are also adjectives are excluded from the matches.
    #[test]
    fn test_noun_but_not_adjective() {
        let doc = Document::new_plain_english_curated(SAMPLE);
        let expr = SequenceExpr::default()
            .then_kind_is_but_is_not(TokenKind::is_noun, TokenKind::is_adjective);

        let found: Vec<_> = expr.iter_matches_in_doc(&doc).collect();

        assert_eq!(found.to_strings(&doc), vec!["Use", "example"]);
    }
}