harper_core/expr/
sequence_expr.rs

1use paste::paste;
2
3use crate::{
4    CharStringExt, Span, Token, TokenKind,
5    expr::{FirstMatchOf, FixedPhrase, LongestMatchOf},
6    patterns::{AnyPattern, IndefiniteArticle, WhitespacePattern, Word, WordSet},
7};
8
9use super::{Expr, Optional, OwnedExprExt, Repeating, Step, UnlessStep};
10
11#[derive(Default)]
12pub struct SequenceExpr {
13    exprs: Vec<Box<dyn Expr>>,
14}
15
/// Generate a family of `then_*` methods from an available `is_*` function on [`TokenKind`].
///
/// For a quality `foo` (i.e. `TokenKind::is_foo` exists) this expands to four builder methods:
///
/// - `then_foo`: a step matching a token for which `is_foo()` is true.
/// - `then_optional_foo`: the same step, but optional.
/// - `then_one_or_more_foos`: one or more consecutive tokens for which `is_foo()` is true.
/// - `then_anything_but_foo`: a step matching a token for which `is_foo()` is false.
///
/// The macro must be invoked inside an `impl SequenceExpr` block, because the generated
/// methods call [`SequenceExpr::then`], [`SequenceExpr::then_optional`] and
/// [`SequenceExpr::then_one_or_more`].
macro_rules! gen_then_from_is {
    ($quality:ident) => {
        paste! {
            #[doc = concat!("Adds a step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_$quality >] (self) -> Self {
                self.then(|tok: &Token, _source: &[char]| tok.kind.[< is_$quality >]())
            }

            #[doc = concat!("Adds an optional step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_optional_$quality >] (self) -> Self {
                self.then_optional(|tok: &Token, _source: &[char]| tok.kind.[< is_$quality >]())
            }

            #[doc = concat!("Adds a step matching one or more consecutive tokens where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_one_or_more_$quality s >] (self) -> Self {
                // The closure is passed as-is: `then_one_or_more` boxes its argument itself,
                // so boxing it here as well would only add a second indirection.
                self.then_one_or_more(|tok: &Token, _source: &[char]| tok.kind.[< is_$quality >]())
            }

            #[doc = concat!("Adds a step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns false.")]
            pub fn [< then_anything_but_$quality >] (self) -> Self {
                self.then(|tok: &Token, _source: &[char]| !tok.kind.[< is_$quality >]())
            }
        }
    };
}
54
55impl Expr for SequenceExpr {
56    /// Run the expression starting at an index, returning the total matched window.
57    ///
58    /// If any step returns `None`, the entire expression does as well.
59    fn run(&self, mut cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
60        let mut window = Span::new_with_len(cursor, 0);
61
62        for cur_expr in &self.exprs {
63            let out = cur_expr.run(cursor, tokens, source)?;
64
65            // Only expand the window if the match actually covers some tokens
66            if out.end > out.start {
67                window.expand_to_include(out.start);
68                window.expand_to_include(out.end.checked_sub(1).unwrap_or(out.start));
69            }
70
71            // Only advance cursor if we actually matched something
72            if out.end > cursor {
73                cursor = out.end;
74            } else if out.start < cursor {
75                cursor = out.start;
76            }
77            // If both start and end are equal to cursor, don't move the cursor
78        }
79
80        Some(window)
81    }
82}
83
84impl SequenceExpr {
85    // Constructor methods
86
87    // Single word token methods
88
89    /// Construct a new sequence with a [`Word`] at the beginning of the operation list.
90    pub fn any_capitalization_of(word: &'static str) -> Self {
91        Self::default().then_any_capitalization_of(word)
92    }
93
94    /// Shorthand for [`Self::any_capitalization_of`].
95    pub fn aco(word: &'static str) -> Self {
96        Self::any_capitalization_of(word)
97    }
98
99    /// Match any word from the given set of words, case-insensitive.
100    pub fn word_set(words: &'static [&'static str]) -> Self {
101        Self::default().then_word_set(words)
102    }
103
104    /// Match any word.
105    pub fn any_word() -> Self {
106        Self::default().then_any_word()
107    }
108
109    // Expressions of more than one token
110
111    /// Match a fixed phrase.
112    pub fn fixed_phrase(phrase: &'static str) -> Self {
113        Self::default().then_fixed_phrase(phrase)
114    }
115
116    // Multiple expressions
117
118    /// Match the first of multiple expressions.
119    pub fn any_of(exprs: Vec<Box<dyn Expr>>) -> Self {
120        Self::default().then_any_of(exprs)
121    }
122
123    /// Will be accepted unless the condition matches.
124    pub fn unless(condition: impl Expr + 'static) -> Self {
125        Self::default().then_unless(condition)
126    }
127
128    // Builder methods
129
130    /// Push an [expression](Expr) to the operation list.
131    pub fn then(mut self, expr: impl Expr + 'static) -> Self {
132        self.exprs.push(Box::new(expr));
133        self
134    }
135
136    /// Pushes an expression that could move the cursor to the sequence, but does not require it.
137    pub fn then_optional(mut self, expr: impl Expr + 'static) -> Self {
138        self.exprs.push(Box::new(Optional::new(expr)));
139        self
140    }
141
142    /// Pushes an expression that will match any of the provided expressions.
143    ///
144    /// If more than one of the provided expressions match, this function provides no guarantee
145    /// as to which match will end up being used. If you need to get the longest of multiple
146    /// matches, use [`Self::then_longest_of()`] instead.
147    pub fn then_any_of(mut self, exprs: Vec<Box<dyn Expr>>) -> Self {
148        self.exprs.push(Box::new(FirstMatchOf::new(exprs)));
149        self
150    }
151
152    /// Pushes an expression that will match the longest of the provided expressions.
153    ///
154    /// If you don't need the longest match, prefer using the short-circuiting
155    /// [`Self::then_any_of()`] instead.
156    pub fn then_longest_of(mut self, exprs: Vec<Box<dyn Expr>>) -> Self {
157        self.exprs.push(Box::new(LongestMatchOf::new(exprs)));
158        self
159    }
160
161    /// Appends the steps in `other` onto the end of `self`.
162    /// This is more efficient than [`Self::then`] because it avoids pointer redirection.
163    pub fn then_seq(mut self, mut other: Self) -> Self {
164        self.exprs.append(&mut other.exprs);
165        self
166    }
167
168    /// Pushes an expression that will match any word from the given set of words, case-insensitive.
169    pub fn then_word_set(self, words: &'static [&'static str]) -> Self {
170        self.then(WordSet::new(words))
171    }
172
173    /// Matches any token whose `Kind` exactly matches.
174    pub fn then_strict(self, kind: TokenKind) -> Self {
175        self.then(move |tok: &Token, _source: &[char]| tok.kind == kind)
176    }
177
178    /// Match against one or more whitespace tokens.
179    pub fn then_whitespace(self) -> Self {
180        self.then(WhitespacePattern)
181    }
182
183    /// Match against one or more whitespace tokens.
184    pub fn then_whitespace_or_hyphen(self) -> Self {
185        self.then(WhitespacePattern.or(|tok: &Token, _: &[char]| tok.kind.is_hyphen()))
186    }
187
188    /// Shorthand for [`Self::then_whitespace_or_hyphen`].
189    pub fn t_ws_h(self) -> Self {
190        self.then_whitespace_or_hyphen()
191    }
192
193    /// Shorthand for [`Self::then_whitespace`].
194    pub fn t_ws(self) -> Self {
195        self.then_whitespace()
196    }
197
198    pub fn then_one_or_more(self, expr: impl Expr + 'static) -> Self {
199        self.then(Repeating::new(Box::new(expr), 1))
200    }
201
202    /// Create a new condition that will step one token forward if met.
203    /// If the condition is _not_ met, the whole expression returns `None`.
204    ///
205    /// This can be used to build out exceptions to other rules.
206    ///
207    /// See [`UnlessStep`] for more info.
208    pub fn then_unless(self, condition: impl Expr + 'static) -> Self {
209        self.then(UnlessStep::new(condition, |_tok: &Token, _src: &[char]| {
210            true
211        }))
212    }
213
214    /// Match any single token.
215    ///
216    /// See [`AnyPattern`] for more info.
217    pub fn then_anything(self) -> Self {
218        self.then(AnyPattern)
219    }
220
221    /// Match any single token.
222    ///
223    /// Shorthand for [`Self::then_anything`].
224    pub fn t_any(self) -> Self {
225        self.then_anything()
226    }
227
228    // Word matching methods
229
230    /// Matches any word.
231    pub fn then_any_word(self) -> Self {
232        self.then(|tok: &Token, _source: &[char]| tok.kind.is_word())
233    }
234
235    /// Match examples of `word` that have any capitalization.
236    pub fn then_any_capitalization_of(self, word: &'static str) -> Self {
237        self.then(Word::new(word))
238    }
239
240    /// Shorthand for [`Self::then_any_capitalization_of`].
241    pub fn t_aco(self, word: &'static str) -> Self {
242        self.then_any_capitalization_of(word)
243    }
244
245    /// Match examples of `word` case-sensitively.
246    pub fn then_exact_word(self, word: &'static str) -> Self {
247        self.then(Word::new_exact(word))
248    }
249
250    /// Match a fixed phrase.
251    pub fn then_fixed_phrase(self, phrase: &'static str) -> Self {
252        self.then(FixedPhrase::from_phrase(phrase))
253    }
254
255    /// Match any word except the ones in `words`.
256    pub fn then_word_except(self, words: &'static [&'static str]) -> Self {
257        self.then(move |tok: &Token, src: &[char]| {
258            !tok.kind.is_word()
259                || !words
260                    .iter()
261                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
262        })
263    }
264
265    // Token kind/predicate matching methods
266
267    // One kind
268
269    /// Match a token of a given kind which is not in the list of words.
270    pub fn then_kind_except<F>(self, pred_is: F, ex: &'static [&'static str]) -> Self
271    where
272        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
273    {
274        self.then(move |tok: &Token, src: &[char]| {
275            pred_is(&tok.kind)
276                && !ex
277                    .iter()
278                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
279        })
280    }
281
282    // Two kinds
283
284    /// Match a token where both token kind predicates return true.
285    /// For instance, a word that can be both noun and verb.
286    pub fn then_kind_both<F1, F2>(self, pred_is_1: F1, pred_is_2: F2) -> Self
287    where
288        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
289        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
290    {
291        self.then(move |tok: &Token, _source: &[char]| pred_is_1(&tok.kind) && pred_is_2(&tok.kind))
292    }
293
294    /// Match a token where either of the two token kind predicates returns true.
295    /// For instance, an adjective or an adverb.
296    pub fn then_kind_either<F1, F2>(self, pred_is_1: F1, pred_is_2: F2) -> Self
297    where
298        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
299        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
300    {
301        self.then(move |tok: &Token, _source: &[char]| pred_is_1(&tok.kind) || pred_is_2(&tok.kind))
302    }
303
304    /// Match a token where the first token kind predicate returns true and the second returns false.
305    /// For instance, a word that can be a noun but cannot be a verb.
306    pub fn then_kind_is_but_is_not<F1, F2>(self, pred_is: F1, pred_not: F2) -> Self
307    where
308        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
309        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
310    {
311        self.then(move |tok: &Token, _source: &[char]| pred_is(&tok.kind) && !pred_not(&tok.kind))
312    }
313
314    /// Match a token where the first token kind predicate returns true and the second returns false,
315    /// and the token is not in the list of exceptions.
316    pub fn then_kind_is_but_is_not_except<F1, F2>(
317        self,
318        pred_is: F1,
319        pred_not: F2,
320        ex: &'static [&'static str],
321    ) -> Self
322    where
323        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
324        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
325    {
326        self.then(move |tok: &Token, src: &[char]| {
327            pred_is(&tok.kind)
328                && !pred_not(&tok.kind)
329                && !ex
330                    .iter()
331                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
332        })
333    }
334
335    gen_then_from_is!(sentence_terminator);
336    // More than two kinds
337
338    /// Match a token where any of the token kind predicates returns true.
339    /// Like `then_kind_either` but for more than two predicates.
340    pub fn then_kind_any<F>(self, preds_is: &'static [F]) -> Self
341    where
342        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
343    {
344        self.then(move |tok: &Token, _source: &[char]| preds_is.iter().any(|pred| pred(&tok.kind)))
345    }
346
347    /// Match a token where any of the token kind predicates returns true,
348    /// and the word is not in the list of exceptions.
349    pub fn then_kind_any_except<F>(
350        self,
351        preds_is: &'static [F],
352        ex: &'static [&'static str],
353    ) -> Self
354    where
355        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
356    {
357        self.then(move |tok: &Token, src: &[char]| {
358            preds_is.iter().any(|pred| pred(&tok.kind))
359                && !ex
360                    .iter()
361                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
362        })
363    }
364
365    /// Match a token where any of the token kind predicates returns true,
366    /// or the token is in the list of words.
367    pub fn then_kind_any_or_words<F>(
368        self,
369        preds: &'static [F],
370        words: &'static [&'static str],
371    ) -> Self
372    where
373        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
374    {
375        self.then(move |tok: &Token, src: &[char]| {
376            preds.iter().any(|pred| pred(&tok.kind))
377                || words
378                    .iter()
379                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
380        })
381    }
382
383    /// Match a token where any of the first token kind predicates returns true,
384    /// the second returns false, and the token is not in the list of exceptions.    
385    pub fn then_kind_any_but_not_except<F1, F2>(
386        self,
387        preds_is: &'static [F1],
388        pred_not: F2,
389        ex: &'static [&'static str],
390    ) -> Self
391    where
392        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
393        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
394    {
395        self.then(move |tok: &Token, src: &[char]| {
396            preds_is.iter().any(|pred| pred(&tok.kind))
397                && !pred_not(&tok.kind)
398                && !ex
399                    .iter()
400                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
401        })
402    }
403
404    // Word property matching methods
405
406    // Out-of-vocabulary word. (Words not in the dictionary)
407    gen_then_from_is!(oov);
408    gen_then_from_is!(swear);
409
410    // Part-of-speech matching methods
411
412    // Nominals (nouns and pronouns)
413
414    gen_then_from_is!(nominal);
415    gen_then_from_is!(plural_nominal);
416    gen_then_from_is!(non_plural_nominal);
417    gen_then_from_is!(possessive_nominal);
418
419    // Nouns
420
421    gen_then_from_is!(noun);
422    gen_then_from_is!(proper_noun);
423    gen_then_from_is!(mass_noun_only);
424
425    // Pronouns
426
427    gen_then_from_is!(pronoun);
428    gen_then_from_is!(personal_pronoun);
429    gen_then_from_is!(first_person_singular_pronoun);
430    gen_then_from_is!(first_person_plural_pronoun);
431    gen_then_from_is!(second_person_pronoun);
432    gen_then_from_is!(third_person_pronoun);
433    gen_then_from_is!(third_person_singular_pronoun);
434    gen_then_from_is!(third_person_plural_pronoun);
435    gen_then_from_is!(subject_pronoun);
436    gen_then_from_is!(object_pronoun);
437
438    // Verbs
439
440    gen_then_from_is!(verb);
441    gen_then_from_is!(auxiliary_verb);
442    gen_then_from_is!(linking_verb);
443    gen_then_from_is!(verb_lemma);
444    gen_then_from_is!(verb_simple_past_form);
445    gen_then_from_is!(verb_past_participle_form);
446
447    // Adjectives
448
449    gen_then_from_is!(adjective);
450    gen_then_from_is!(positive_adjective);
451    gen_then_from_is!(comparative_adjective);
452    gen_then_from_is!(superlative_adjective);
453
454    // Adverbs
455
456    gen_then_from_is!(adverb);
457
458    // Determiners
459
460    gen_then_from_is!(determiner);
461    gen_then_from_is!(demonstrative_determiner);
462    gen_then_from_is!(possessive_determiner);
463    gen_then_from_is!(quantifier);
464    gen_then_from_is!(non_quantifier_determiner);
465    gen_then_from_is!(non_demonstrative_determiner);
466
467    /// Push an [`IndefiniteArticle`] to the end of the operation list.
468    pub fn then_indefinite_article(self) -> Self {
469        self.then(IndefiniteArticle::default())
470    }
471
472    // Other parts of speech
473
474    gen_then_from_is!(conjunction);
475    gen_then_from_is!(preposition);
476
477    // Punctuation
478
479    gen_then_from_is!(punctuation);
480    gen_then_from_is!(apostrophe);
481    gen_then_from_is!(comma);
482    gen_then_from_is!(hyphen);
483    gen_then_from_is!(period);
484    gen_then_from_is!(semicolon);
485    gen_then_from_is!(quote);
486
487    // Other
488
489    gen_then_from_is!(number);
490    gen_then_from_is!(case_separator);
491    gen_then_from_is!(likely_homograph);
492}
493
494impl<S> From<S> for SequenceExpr
495where
496    S: Step + 'static,
497{
498    fn from(step: S) -> Self {
499        Self {
500            exprs: vec![Box::new(step)],
501        }
502    }
503}
504
#[cfg(test)]
mod tests {
    use crate::{
        Document, TokenKind,
        expr::{ExprExt, SequenceExpr},
        linting::tests::SpanVecExt,
    };

    /// `then_kind_both` should only match tokens satisfying both predicates.
    #[test]
    fn test_kind_both() {
        let doc = Document::new_plain_english_curated("Use a good example.");
        let expr = SequenceExpr::default().then_kind_both(TokenKind::is_noun, TokenKind::is_verb);

        let found: Vec<_> = expr.iter_matches_in_doc(&doc).collect();

        assert_eq!(found.to_strings(&doc), vec!["Use", "good", "example"]);
    }

    /// `then_kind_either` should match tokens satisfying at least one predicate.
    #[test]
    fn test_adjective_or_determiner() {
        let doc = Document::new_plain_english_curated("Use a good example.");
        let expr = SequenceExpr::default()
            .then_kind_either(TokenKind::is_adjective, TokenKind::is_determiner);

        let found: Vec<_> = expr.iter_matches_in_doc(&doc).collect();

        assert_eq!(found.to_strings(&doc), vec!["a", "good"]);
    }

    /// `then_kind_is_but_is_not` should match the first predicate while excluding the second.
    #[test]
    fn test_noun_but_not_adjective() {
        let doc = Document::new_plain_english_curated("Use a good example.");
        let expr = SequenceExpr::default()
            .then_kind_is_but_is_not(TokenKind::is_noun, TokenKind::is_adjective);

        let found: Vec<_> = expr.iter_matches_in_doc(&doc).collect();

        assert_eq!(found.to_strings(&doc), vec!["Use", "example"]);
    }
}
539}