harper_core/expr/
sequence_expr.rs

1use paste::paste;
2
3use crate::{
4    CharStringExt, Span, Token, TokenKind,
5    expr::{FirstMatchOf, FixedPhrase, LongestMatchOf},
6    patterns::{AnyPattern, IndefiniteArticle, WhitespacePattern, Word, WordSet},
7};
8
9use super::{Expr, Optional, OwnedExprExt, Repeating, Step, UnlessStep};
10
11#[derive(Default)]
12pub struct SequenceExpr {
13    exprs: Vec<Box<dyn Expr>>,
14}
15
/// Generate a family of `then_*` builder methods from an `is_*` predicate on [`TokenKind`].
///
/// Invoked as `gen_then_from_is!(foo)` inside an `impl SequenceExpr` block, this expands to:
///
/// - `then_foo`: a step matching a token for which `is_foo()` is true,
/// - `then_optional_foo`: the same predicate, pushed through `then_optional`,
/// - `then_one_or_more_foos`: the same predicate, pushed through `then_one_or_more`,
/// - `then_anything_but_foo`: a step matching a token for which `is_foo()` is false.
macro_rules! gen_then_from_is {
    ($quality:ident) => {
        paste! {
            #[doc = concat!("Adds a step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_$quality >] (self) -> Self{
                self.then_kind_where(|kind| {
                    kind.[< is_$quality >]()
                })
            }

            #[doc = concat!("Adds an optional step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_optional_$quality >] (self) -> Self{
                self.then_optional(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                })
            }

            #[doc = concat!("Adds a step matching one or more consecutive tokens where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_one_or_more_$quality s >] (self) -> Self{
                // `then_one_or_more` already boxes its argument, so the closure is handed
                // over directly (as `then_optional_*` does) instead of being boxed twice.
                self.then_one_or_more(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                })
            }

            #[doc = concat!("Adds a step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns false.")]
            pub fn [< then_anything_but_$quality >] (self) -> Self{
                self.then_kind_where(|kind| {
                    !kind.[< is_$quality >]()
                })
            }
        }
    };
}
50
51impl Expr for SequenceExpr {
52    /// Run the expression starting at an index, returning the total matched window.
53    ///
54    /// If any step returns `None`, the entire expression does as well.
55    fn run(&self, mut cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
56        let mut window = Span::new_with_len(cursor, 0);
57
58        for cur_expr in &self.exprs {
59            let out = cur_expr.run(cursor, tokens, source)?;
60
61            // Only expand the window if the match actually covers some tokens
62            if out.end > out.start {
63                window.expand_to_include(out.start);
64                window.expand_to_include(out.end.checked_sub(1).unwrap_or(out.start));
65            }
66
67            // Only advance cursor if we actually matched something
68            if out.end > cursor {
69                cursor = out.end;
70            } else if out.start < cursor {
71                cursor = out.start;
72            }
73            // If both start and end are equal to cursor, don't move the cursor
74        }
75
76        Some(window)
77    }
78}
79
80impl SequenceExpr {
81    // Constructor methods
82
83    // Single word token methods
84
85    /// Construct a new sequence with a [`Word`] at the beginning of the operation list.
86    pub fn any_capitalization_of(word: &'static str) -> Self {
87        Self::default().then_any_capitalization_of(word)
88    }
89
90    /// Shorthand for [`Self::any_capitalization_of`].
91    pub fn aco(word: &'static str) -> Self {
92        Self::any_capitalization_of(word)
93    }
94
95    /// Match any word from the given set of words, case-insensitive.
96    pub fn word_set(words: &'static [&'static str]) -> Self {
97        Self::default().then_word_set(words)
98    }
99
100    /// Match any word.
101    pub fn any_word() -> Self {
102        Self::default().then_any_word()
103    }
104
105    // Expressions of more than one token
106
107    /// Match a fixed phrase.
108    pub fn fixed_phrase(phrase: &'static str) -> Self {
109        Self::default().then_fixed_phrase(phrase)
110    }
111
112    // Multiple expressions
113
114    /// Match the first of multiple expressions.
115    pub fn any_of(exprs: Vec<Box<dyn Expr>>) -> Self {
116        Self::default().then_any_of(exprs)
117    }
118
119    /// Will be accepted unless the condition matches.
120    pub fn unless(condition: impl Expr + 'static) -> Self {
121        Self::default().then_unless(condition)
122    }
123
124    // Builder methods
125
126    /// Push an [expression](Expr) to the operation list.
127    pub fn then(mut self, expr: impl Expr + 'static) -> Self {
128        self.exprs.push(Box::new(expr));
129        self
130    }
131
132    /// Pushes an expression that could move the cursor to the sequence, but does not require it.
133    pub fn then_optional(mut self, expr: impl Expr + 'static) -> Self {
134        self.exprs.push(Box::new(Optional::new(expr)));
135        self
136    }
137
138    /// Pushes an expression that will match any of the provided expressions.
139    ///
140    /// If more than one of the provided expressions match, this function provides no guarantee
141    /// as to which match will end up being used. If you need to get the longest of multiple
142    /// matches, use [`Self::then_longest_of()`] instead.
143    pub fn then_any_of(mut self, exprs: Vec<Box<dyn Expr>>) -> Self {
144        self.exprs.push(Box::new(FirstMatchOf::new(exprs)));
145        self
146    }
147
148    /// Pushes an expression that will match the longest of the provided expressions.
149    ///
150    /// If you don't need the longest match, prefer using the short-circuiting
151    /// [`Self::then_any_of()`] instead.
152    pub fn then_longest_of(mut self, exprs: Vec<Box<dyn Expr>>) -> Self {
153        self.exprs.push(Box::new(LongestMatchOf::new(exprs)));
154        self
155    }
156
157    /// Appends the steps in `other` onto the end of `self`.
158    /// This is more efficient than [`Self::then`] because it avoids pointer redirection.
159    pub fn then_seq(mut self, mut other: Self) -> Self {
160        self.exprs.append(&mut other.exprs);
161        self
162    }
163
164    /// Pushes an expression that will match any word from the given set of words, case-insensitive.
165    pub fn then_word_set(self, words: &'static [&'static str]) -> Self {
166        self.then(WordSet::new(words))
167    }
168
169    /// Match against one or more whitespace tokens.
170    pub fn then_whitespace(self) -> Self {
171        self.then(WhitespacePattern)
172    }
173
174    /// Match against one or more whitespace tokens.
175    pub fn then_whitespace_or_hyphen(self) -> Self {
176        self.then(WhitespacePattern.or(|tok: &Token, _: &[char]| tok.kind.is_hyphen()))
177    }
178
179    /// Shorthand for [`Self::then_whitespace_or_hyphen`].
180    pub fn t_ws_h(self) -> Self {
181        self.then_whitespace_or_hyphen()
182    }
183
184    /// Shorthand for [`Self::then_whitespace`].
185    pub fn t_ws(self) -> Self {
186        self.then_whitespace()
187    }
188
189    pub fn then_one_or_more(self, expr: impl Expr + 'static) -> Self {
190        self.then(Repeating::new(Box::new(expr), 1))
191    }
192
193    /// Create a new condition that will step one token forward if met.
194    /// If the condition is _not_ met, the whole expression returns `None`.
195    ///
196    /// This can be used to build out exceptions to other rules.
197    ///
198    /// See [`UnlessStep`] for more info.
199    pub fn then_unless(self, condition: impl Expr + 'static) -> Self {
200        self.then(UnlessStep::new(condition, |_tok: &Token, _src: &[char]| {
201            true
202        }))
203    }
204
205    /// Match any single token.
206    ///
207    /// See [`AnyPattern`] for more info.
208    pub fn then_anything(self) -> Self {
209        self.then(AnyPattern)
210    }
211
212    /// Match any single token.
213    ///
214    /// Shorthand for [`Self::then_anything`].
215    pub fn t_any(self) -> Self {
216        self.then_anything()
217    }
218
219    // Word matching methods
220
221    /// Matches any word.
222    pub fn then_any_word(self) -> Self {
223        self.then_kind_where(|kind| kind.is_word())
224    }
225
226    /// Match examples of `word` that have any capitalization.
227    pub fn then_any_capitalization_of(self, word: &'static str) -> Self {
228        self.then(Word::new(word))
229    }
230
231    /// Shorthand for [`Self::then_any_capitalization_of`].
232    pub fn t_aco(self, word: &'static str) -> Self {
233        self.then_any_capitalization_of(word)
234    }
235
236    /// Match examples of `word` case-sensitively.
237    pub fn then_exact_word(self, word: &'static str) -> Self {
238        self.then(Word::new_exact(word))
239    }
240
241    /// Match a fixed phrase.
242    pub fn then_fixed_phrase(self, phrase: &'static str) -> Self {
243        self.then(FixedPhrase::from_phrase(phrase))
244    }
245
246    /// Match any word except the ones in `words`.
247    pub fn then_word_except(self, words: &'static [&'static str]) -> Self {
248        self.then(move |tok: &Token, src: &[char]| {
249            !tok.kind.is_word()
250                || !words
251                    .iter()
252                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
253        })
254    }
255
256    // Token kind/predicate matching methods
257
258    // One kind
259
260    /// Matches any token whose `Kind` exactly matches.
261    pub fn then_kind(self, kind: TokenKind) -> Self {
262        self.then_kind_where(move |k| kind == *k)
263    }
264
265    /// Matches a token where the provided closure returns true for the token's kind.
266    pub fn then_kind_where<F>(mut self, predicate: F) -> Self
267    where
268        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
269    {
270        self.exprs
271            .push(Box::new(move |tok: &Token, _source: &[char]| {
272                predicate(&tok.kind)
273            }));
274        self
275    }
276
277    /// Match a token of a given kind which is not in the list of words.
278    pub fn then_kind_except<F>(self, pred_is: F, ex: &'static [&'static str]) -> Self
279    where
280        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
281    {
282        self.then(move |tok: &Token, src: &[char]| {
283            pred_is(&tok.kind)
284                && !ex
285                    .iter()
286                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
287        })
288    }
289
290    // Two kinds
291
292    /// Match a token where both token kind predicates return true.
293    /// For instance, a word that can be both noun and verb.
294    pub fn then_kind_both<F1, F2>(self, pred_is_1: F1, pred_is_2: F2) -> Self
295    where
296        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
297        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
298    {
299        self.then_kind_where(move |k| pred_is_1(k) && pred_is_2(k))
300    }
301
302    /// Match a token where either of the two token kind predicates returns true.
303    /// For instance, an adjective or an adverb.
304    pub fn then_kind_either<F1, F2>(self, pred_is_1: F1, pred_is_2: F2) -> Self
305    where
306        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
307        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
308    {
309        self.then_kind_where(move |k| pred_is_1(k) || pred_is_2(k))
310    }
311
312    /// Match a token where neither of the two token kind predicates returns true.
313    /// For instance, a word that can't be a verb or a noun.
314    pub fn then_kind_neither<F1, F2>(self, pred_isnt_1: F1, pred_isnt_2: F2) -> Self
315    where
316        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
317        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
318    {
319        self.then_kind_where(move |k| !pred_isnt_1(k) && !pred_isnt_2(k))
320    }
321
322    /// Match a token where the first token kind predicate returns true and the second returns false.
323    /// For instance, a word that can be a noun but cannot be a verb.
324    pub fn then_kind_is_but_is_not<F1, F2>(self, pred_is: F1, pred_not: F2) -> Self
325    where
326        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
327        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
328    {
329        self.then_kind_where(move |k| pred_is(k) && !pred_not(k))
330    }
331
332    /// Match a token where the first token kind predicate returns true and all of the second return false.
333    /// For instance, a word that can be a verb but not a noun or an adjective.
334    pub fn then_kind_is_but_isnt_any_of<F1, F2>(
335        self,
336        pred_is: F1,
337        preds_isnt: &'static [F2],
338    ) -> Self
339    where
340        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
341        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
342    {
343        self.then_kind_where(move |k| pred_is(k) && !preds_isnt.iter().any(|pred| pred(k)))
344    }
345
346    /// Match a token where the first token kind predicate returns true and the second returns false,
347    /// and the token is not in the list of exceptions.
348    pub fn then_kind_is_but_is_not_except<F1, F2>(
349        self,
350        pred_is: F1,
351        pred_not: F2,
352        ex: &'static [&'static str],
353    ) -> Self
354    where
355        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
356        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
357    {
358        self.then(move |tok: &Token, src: &[char]| {
359            pred_is(&tok.kind)
360                && !pred_not(&tok.kind)
361                && !ex
362                    .iter()
363                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
364        })
365    }
366
367    gen_then_from_is!(sentence_terminator);
368    // More than two kinds
369
370    /// Match a token where any of the token kind predicates returns true.
371    /// Like `then_kind_either` but for more than two predicates.
372    pub fn then_kind_any<F>(self, preds_is: &'static [F]) -> Self
373    where
374        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
375    {
376        self.then_kind_where(move |k| preds_is.iter().any(|pred| pred(k)))
377    }
378
379    /// Match a token where none of the token kind predicates returns true.
380    /// Like `then_kind_neither` but for more than two predicates.
381    pub fn then_kind_none_of<F>(self, preds_isnt: &'static [F]) -> Self
382    where
383        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
384    {
385        self.then_kind_where(move |k| preds_isnt.iter().all(|pred| !pred(k)))
386    }
387
388    /// Match a token where any of the token kind predicates returns true,
389    /// and the word is not in the list of exceptions.
390    pub fn then_kind_any_except<F>(
391        self,
392        preds_is: &'static [F],
393        ex: &'static [&'static str],
394    ) -> Self
395    where
396        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
397    {
398        self.then(move |tok: &Token, src: &[char]| {
399            preds_is.iter().any(|pred| pred(&tok.kind))
400                && !ex
401                    .iter()
402                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
403        })
404    }
405
406    /// Match a token where any of the token kind predicates returns true,
407    /// or the token is in the list of words.
408    pub fn then_kind_any_or_words<F>(
409        self,
410        preds: &'static [F],
411        words: &'static [&'static str],
412    ) -> Self
413    where
414        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
415    {
416        self.then(move |tok: &Token, src: &[char]| {
417            preds.iter().any(|pred| pred(&tok.kind))
418                || words
419                    .iter()
420                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
421        })
422    }
423
424    /// Match a token where any of the first token kind predicates returns true,
425    /// the second returns false, and the token is not in the list of exceptions.    
426    pub fn then_kind_any_but_not_except<F1, F2>(
427        self,
428        preds_is: &'static [F1],
429        pred_not: F2,
430        ex: &'static [&'static str],
431    ) -> Self
432    where
433        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
434        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
435    {
436        self.then(move |tok: &Token, src: &[char]| {
437            preds_is.iter().any(|pred| pred(&tok.kind))
438                && !pred_not(&tok.kind)
439                && !ex
440                    .iter()
441                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
442        })
443    }
444
445    // Word property matching methods
446
447    // Out-of-vocabulary word. (Words not in the dictionary)
448    gen_then_from_is!(oov);
449    gen_then_from_is!(swear);
450
451    // Part-of-speech matching methods
452
453    // Nominals (nouns and pronouns)
454
455    gen_then_from_is!(nominal);
456    gen_then_from_is!(plural_nominal);
457    gen_then_from_is!(non_plural_nominal);
458    gen_then_from_is!(possessive_nominal);
459
460    // Nouns
461
462    gen_then_from_is!(noun);
463    gen_then_from_is!(proper_noun);
464    gen_then_from_is!(plural_noun);
465    gen_then_from_is!(mass_noun_only);
466
467    // Pronouns
468
469    gen_then_from_is!(pronoun);
470    gen_then_from_is!(personal_pronoun);
471    gen_then_from_is!(first_person_singular_pronoun);
472    gen_then_from_is!(first_person_plural_pronoun);
473    gen_then_from_is!(second_person_pronoun);
474    gen_then_from_is!(third_person_pronoun);
475    gen_then_from_is!(third_person_singular_pronoun);
476    gen_then_from_is!(third_person_plural_pronoun);
477    gen_then_from_is!(subject_pronoun);
478    gen_then_from_is!(object_pronoun);
479
480    // Verbs
481
482    gen_then_from_is!(verb);
483    gen_then_from_is!(auxiliary_verb);
484    gen_then_from_is!(linking_verb);
485    gen_then_from_is!(verb_lemma);
486    gen_then_from_is!(verb_simple_past_form);
487    gen_then_from_is!(verb_past_participle_form);
488    gen_then_from_is!(verb_progressive_form);
489
490    // Adjectives
491
492    gen_then_from_is!(adjective);
493    gen_then_from_is!(positive_adjective);
494    gen_then_from_is!(comparative_adjective);
495    gen_then_from_is!(superlative_adjective);
496
497    // Adverbs
498
499    gen_then_from_is!(adverb);
500
501    // Determiners
502
503    gen_then_from_is!(determiner);
504    gen_then_from_is!(demonstrative_determiner);
505    gen_then_from_is!(possessive_determiner);
506    gen_then_from_is!(quantifier);
507    gen_then_from_is!(non_quantifier_determiner);
508    gen_then_from_is!(non_demonstrative_determiner);
509
510    /// Push an [`IndefiniteArticle`] to the end of the operation list.
511    pub fn then_indefinite_article(self) -> Self {
512        self.then(IndefiniteArticle::default())
513    }
514
515    // Other parts of speech
516
517    gen_then_from_is!(conjunction);
518    gen_then_from_is!(preposition);
519
520    // Numbers
521
522    gen_then_from_is!(number);
523    gen_then_from_is!(cardinal_number);
524    gen_then_from_is!(ordinal_number);
525
526    // Punctuation
527
528    gen_then_from_is!(punctuation);
529    gen_then_from_is!(apostrophe);
530    gen_then_from_is!(comma);
531    gen_then_from_is!(hyphen);
532    gen_then_from_is!(period);
533    gen_then_from_is!(semicolon);
534    gen_then_from_is!(quote);
535
536    // Other
537
538    gen_then_from_is!(case_separator);
539    gen_then_from_is!(likely_homograph);
540}
541
542impl<S> From<S> for SequenceExpr
543where
544    S: Step + 'static,
545{
546    fn from(step: S) -> Self {
547        Self {
548            exprs: vec![Box::new(step)],
549        }
550    }
551}
552
#[cfg(test)]
mod tests {
    use crate::{
        Document, TokenKind,
        expr::{ExprExt, SequenceExpr},
        linting::tests::SpanVecExt,
    };

    /// Sentence that every test below is run against.
    const SENTENCE: &str = "Use a good example.";

    /// Tokens that are tagged as both noun and verb.
    #[test]
    fn test_kind_both() {
        let doc = Document::new_plain_english_curated(SENTENCE);
        let expr = SequenceExpr::default()
            .then_kind_both(TokenKind::is_noun, TokenKind::is_verb);
        let found: Vec<_> = expr.iter_matches_in_doc(&doc).collect();
        assert_eq!(found.to_strings(&doc), vec!["Use", "good", "example"]);
    }

    /// Tokens that are tagged as adjective or as determiner.
    #[test]
    fn test_adjective_or_determiner() {
        let doc = Document::new_plain_english_curated(SENTENCE);
        let expr = SequenceExpr::default()
            .then_kind_either(TokenKind::is_adjective, TokenKind::is_determiner);
        let found: Vec<_> = expr.iter_matches_in_doc(&doc).collect();
        assert_eq!(found.to_strings(&doc), vec!["a", "good"]);
    }

    /// Tokens that are tagged as noun but not as adjective.
    #[test]
    fn test_noun_but_not_adjective() {
        let doc = Document::new_plain_english_curated(SENTENCE);
        let expr = SequenceExpr::default()
            .then_kind_is_but_is_not(TokenKind::is_noun, TokenKind::is_adjective);
        let found: Vec<_> = expr.iter_matches_in_doc(&doc).collect();
        assert_eq!(found.to_strings(&doc), vec!["Use", "example"]);
    }
}
587}