harper_core/expr/
sequence_expr.rs

1use paste::paste;
2
3use crate::{
4    CharStringExt, Span, Token, TokenKind,
5    expr::{FirstMatchOf, FixedPhrase, LongestMatchOf},
6    patterns::{AnyPattern, IndefiniteArticle, WhitespacePattern, Word, WordSet},
7};
8
9use super::{Expr, Optional, OwnedExprExt, Repeating, Step, UnlessStep};
10
/// An expression made of an ordered list of sub-expressions.
///
/// Sub-expressions are appended with [`SequenceExpr::then`] and the other `then_*`
/// builder methods, and are run one after another by the [`Expr`] implementation.
/// The [`Default`] value is a sequence with no sub-expressions.
#[derive(Default)]
pub struct SequenceExpr {
    /// The sub-expressions, stored in the order in which they are run.
    exprs: Vec<Box<dyn Expr>>,
}
15
/// Generate a family of `then_*` builder methods from an available `is_*` function on
/// [`TokenKind`].
///
/// Invoked as `gen_then_from_is!(quality)` inside an `impl` block, it emits four methods
/// that each consume and return `Self`:
///
/// - `then_<quality>`: a step requiring `TokenKind::is_<quality>()` to be true.
/// - `then_optional_<quality>`: the same check, wrapped as an optional step.
/// - `then_one_or_more_<quality>s`: the same check, repeated one or more times.
/// - `then_anything_but_<quality>`: a step requiring `TokenKind::is_<quality>()` to be false.
macro_rules! gen_then_from_is {
    ($quality:ident) => {
        paste! {
            #[doc = concat!("Adds a step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_$quality >](self) -> Self {
                self.then_kind_where(|kind| kind.[< is_$quality >]())
            }

            #[doc = concat!("Adds an optional step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_optional_$quality >](self) -> Self {
                self.then_optional(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                })
            }

            #[doc = concat!("Adds a step matching one or more consecutive tokens where [`TokenKind::is_", stringify!($quality), "()`] returns true.")]
            pub fn [< then_one_or_more_$quality s >](self) -> Self {
                // `then_one_or_more` boxes its argument itself, so the closure is passed
                // directly (as in `then_optional_*`) instead of being boxed a second time.
                self.then_one_or_more(|tok: &Token, _source: &[char]| {
                    tok.kind.[< is_$quality >]()
                })
            }

            #[doc = concat!("Adds a step matching a token where [`TokenKind::is_", stringify!($quality), "()`] returns false.")]
            pub fn [< then_anything_but_$quality >](self) -> Self {
                self.then_kind_where(|kind| !kind.[< is_$quality >]())
            }
        }
    };
}
50
51impl Expr for SequenceExpr {
52    /// Run the expression starting at an index, returning the total matched window.
53    ///
54    /// If any step returns `None`, the entire expression does as well.
55    fn run(&self, mut cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
56        let mut window = Span::new_with_len(cursor, 0);
57
58        for cur_expr in &self.exprs {
59            let out = cur_expr.run(cursor, tokens, source)?;
60
61            // Only expand the window if the match actually covers some tokens
62            if out.end > out.start {
63                window.expand_to_include(out.start);
64                window.expand_to_include(out.end.checked_sub(1).unwrap_or(out.start));
65            }
66
67            // Only advance cursor if we actually matched something
68            if out.end > cursor {
69                cursor = out.end;
70            } else if out.start < cursor {
71                cursor = out.start;
72            }
73            // If both start and end are equal to cursor, don't move the cursor
74        }
75
76        Some(window)
77    }
78}
79
80impl SequenceExpr {
81    // Constructor methods
82
83    // Single token methods
84
85    /// Construct a new sequence with an [`AnyPattern`] at the beginning of the operation list.
86    pub fn anything() -> Self {
87        Self::default().then_anything()
88    }
89
90    // Single word token methods
91
92    /// Construct a new sequence with a [`Word`] at the beginning of the operation list.
93    pub fn any_capitalization_of(word: &'static str) -> Self {
94        Self::default().then_any_capitalization_of(word)
95    }
96
97    /// Shorthand for [`Self::any_capitalization_of`].
98    pub fn aco(word: &'static str) -> Self {
99        Self::any_capitalization_of(word)
100    }
101
102    /// Match any word from the given set of words, case-insensitive.
103    pub fn word_set(words: &'static [&'static str]) -> Self {
104        Self::default().then_word_set(words)
105    }
106
107    /// Match any word.
108    pub fn any_word() -> Self {
109        Self::default().then_any_word()
110    }
111
112    // Expressions of more than one token
113
114    /// Match a fixed phrase.
115    pub fn fixed_phrase(phrase: &'static str) -> Self {
116        Self::default().then_fixed_phrase(phrase)
117    }
118
119    // Multiple expressions
120
121    /// Match the first of multiple expressions.
122    pub fn any_of(exprs: Vec<Box<dyn Expr>>) -> Self {
123        Self::default().then_any_of(exprs)
124    }
125
126    /// Will be accepted unless the condition matches.
127    pub fn unless(condition: impl Expr + 'static) -> Self {
128        Self::default().then_unless(condition)
129    }
130
131    // Builder methods
132
133    /// Push an [expression](Expr) to the operation list.
134    pub fn then(mut self, expr: impl Expr + 'static) -> Self {
135        self.exprs.push(Box::new(expr));
136        self
137    }
138
139    /// Pushes an expression that could move the cursor to the sequence, but does not require it.
140    pub fn then_optional(mut self, expr: impl Expr + 'static) -> Self {
141        self.exprs.push(Box::new(Optional::new(expr)));
142        self
143    }
144
145    /// Pushes an expression that will match any of the provided expressions.
146    ///
147    /// If more than one of the provided expressions match, this function provides no guarantee
148    /// as to which match will end up being used. If you need to get the longest of multiple
149    /// matches, use [`Self::then_longest_of()`] instead.
150    pub fn then_any_of(mut self, exprs: Vec<Box<dyn Expr>>) -> Self {
151        self.exprs.push(Box::new(FirstMatchOf::new(exprs)));
152        self
153    }
154
155    /// Pushes an expression that will match the longest of the provided expressions.
156    ///
157    /// If you don't need the longest match, prefer using the short-circuiting
158    /// [`Self::then_any_of()`] instead.
159    pub fn then_longest_of(mut self, exprs: Vec<Box<dyn Expr>>) -> Self {
160        self.exprs.push(Box::new(LongestMatchOf::new(exprs)));
161        self
162    }
163
164    /// Appends the steps in `other` onto the end of `self`.
165    /// This is more efficient than [`Self::then`] because it avoids pointer redirection.
166    pub fn then_seq(mut self, mut other: Self) -> Self {
167        self.exprs.append(&mut other.exprs);
168        self
169    }
170
171    /// Pushes an expression that will match any word from the given set of words, case-insensitive.
172    pub fn then_word_set(self, words: &'static [&'static str]) -> Self {
173        self.then(WordSet::new(words))
174    }
175
176    /// Shorthand for [`Self::then_word_set`].
177    pub fn t_set(self, words: &'static [&'static str]) -> Self {
178        self.then_word_set(words)
179    }
180
181    /// Match against one or more whitespace tokens.
182    pub fn then_whitespace(self) -> Self {
183        self.then(WhitespacePattern)
184    }
185
186    /// Shorthand for [`Self::then_whitespace`].
187    pub fn t_ws(self) -> Self {
188        self.then_whitespace()
189    }
190
191    /// Match against one or more whitespace tokens.
192    pub fn then_whitespace_or_hyphen(self) -> Self {
193        self.then(WhitespacePattern.or(|tok: &Token, _: &[char]| tok.kind.is_hyphen()))
194    }
195
196    /// Shorthand for [`Self::then_whitespace_or_hyphen`].
197    pub fn t_ws_h(self) -> Self {
198        self.then_whitespace_or_hyphen()
199    }
200
201    pub fn then_one_or_more(self, expr: impl Expr + 'static) -> Self {
202        self.then(Repeating::new(Box::new(expr), 1))
203    }
204
205    /// Create a new condition that will step one token forward if met.
206    /// If the condition is _not_ met, the whole expression returns `None`.
207    ///
208    /// This can be used to build out exceptions to other rules.
209    ///
210    /// See [`UnlessStep`] for more info.
211    pub fn then_unless(self, condition: impl Expr + 'static) -> Self {
212        self.then(UnlessStep::new(condition, |_tok: &Token, _src: &[char]| {
213            true
214        }))
215    }
216
217    /// Match any single token.
218    ///
219    /// See [`AnyPattern`] for more info.
220    pub fn then_anything(self) -> Self {
221        self.then(AnyPattern)
222    }
223
224    /// Match any single token.
225    ///
226    /// Shorthand for [`Self::then_anything`].
227    pub fn t_any(self) -> Self {
228        self.then_anything()
229    }
230
231    // Word matching methods
232
233    /// Matches any word.
234    pub fn then_any_word(self) -> Self {
235        self.then_kind_where(|kind| kind.is_word())
236    }
237
238    /// Match examples of `word` that have any capitalization.
239    pub fn then_any_capitalization_of(self, word: &'static str) -> Self {
240        self.then(Word::new(word))
241    }
242
243    /// Shorthand for [`Self::then_any_capitalization_of`].
244    pub fn t_aco(self, word: &'static str) -> Self {
245        self.then_any_capitalization_of(word)
246    }
247
248    /// Match examples of `word` case-sensitively.
249    pub fn then_exact_word(self, word: &'static str) -> Self {
250        self.then(Word::new_exact(word))
251    }
252
253    /// Match a fixed phrase.
254    pub fn then_fixed_phrase(self, phrase: &'static str) -> Self {
255        self.then(FixedPhrase::from_phrase(phrase))
256    }
257
258    /// Match any word except the ones in `words`.
259    pub fn then_word_except(self, words: &'static [&'static str]) -> Self {
260        self.then(move |tok: &Token, src: &[char]| {
261            !tok.kind.is_word()
262                || !words
263                    .iter()
264                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
265        })
266    }
267
268    // Token kind/predicate matching methods
269
270    // One kind
271
272    /// Matches any token whose `Kind` exactly matches.
273    pub fn then_kind(self, kind: TokenKind) -> Self {
274        self.then_kind_where(move |k| kind == *k)
275    }
276
277    /// Matches a token where the provided closure returns true for the token's kind.
278    pub fn then_kind_where<F>(mut self, predicate: F) -> Self
279    where
280        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
281    {
282        self.exprs
283            .push(Box::new(move |tok: &Token, _source: &[char]| {
284                predicate(&tok.kind)
285            }));
286        self
287    }
288
289    /// Match a token of a given kind which is not in the list of words.
290    pub fn then_kind_except<F>(self, pred_is: F, ex: &'static [&'static str]) -> Self
291    where
292        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
293    {
294        self.then(move |tok: &Token, src: &[char]| {
295            pred_is(&tok.kind)
296                && !ex
297                    .iter()
298                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
299        })
300    }
301
302    // Two kinds
303
304    /// Match a token where both token kind predicates return true.
305    /// For instance, a word that can be both noun and verb.
306    pub fn then_kind_both<F1, F2>(self, pred_is_1: F1, pred_is_2: F2) -> Self
307    where
308        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
309        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
310    {
311        self.then_kind_where(move |k| pred_is_1(k) && pred_is_2(k))
312    }
313
314    /// Match a token where either of the two token kind predicates returns true.
315    /// For instance, an adjective or an adverb.
316    pub fn then_kind_either<F1, F2>(self, pred_is_1: F1, pred_is_2: F2) -> Self
317    where
318        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
319        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
320    {
321        self.then_kind_where(move |k| pred_is_1(k) || pred_is_2(k))
322    }
323
324    /// Match a token where neither of the two token kind predicates returns true.
325    /// For instance, a word that can't be a verb or a noun.
326    pub fn then_kind_neither<F1, F2>(self, pred_isnt_1: F1, pred_isnt_2: F2) -> Self
327    where
328        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
329        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
330    {
331        self.then_kind_where(move |k| !pred_isnt_1(k) && !pred_isnt_2(k))
332    }
333
334    /// Match a token where the first token kind predicate returns true and the second returns false.
335    /// For instance, a word that can be a noun but cannot be a verb.
336    pub fn then_kind_is_but_is_not<F1, F2>(self, pred_is: F1, pred_not: F2) -> Self
337    where
338        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
339        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
340    {
341        self.then_kind_where(move |k| pred_is(k) && !pred_not(k))
342    }
343
344    /// Match a token where the first token kind predicate returns true and the second returns false,
345    /// and the token is not in the list of exceptions.
346    pub fn then_kind_is_but_is_not_except<F1, F2>(
347        self,
348        pred_is: F1,
349        pred_not: F2,
350        ex: &'static [&'static str],
351    ) -> Self
352    where
353        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
354        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
355    {
356        self.then(move |tok: &Token, src: &[char]| {
357            pred_is(&tok.kind)
358                && !pred_not(&tok.kind)
359                && !ex
360                    .iter()
361                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
362        })
363    }
364
365    /// Match a token where the first token kind predicate returns true and all of the second return false.
366    /// For instance, a word that can be a verb but not a noun or an adjective.
367    pub fn then_kind_is_but_isnt_any_of<F1, F2>(
368        self,
369        pred_is: F1,
370        preds_isnt: &'static [F2],
371    ) -> Self
372    where
373        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
374        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
375    {
376        self.then_kind_where(move |k| pred_is(k) && !preds_isnt.iter().any(|pred| pred(k)))
377    }
378
379    /// Match a token where the first token kind predicate returns true and all of the second return false,
380    /// and the token is not in the list of exceptions.
381    /// For instance, an adjective that isn't also a verb or adverb or the word "likely".
382    pub fn then_kind_is_but_isnt_any_of_except<F1, F2>(
383        self,
384        pred_is: F1,
385        preds_isnt: &'static [F2],
386        ex: &'static [&'static str],
387    ) -> Self
388    where
389        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
390        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
391    {
392        self.then(move |tok: &Token, src: &[char]| {
393            pred_is(&tok.kind)
394                && !preds_isnt.iter().any(|pred| pred(&tok.kind))
395                && !ex
396                    .iter()
397                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
398        })
399    }
400
401    gen_then_from_is!(sentence_terminator);
402    // More than two kinds
403
404    /// Match a token where any of the token kind predicates returns true.
405    /// Like `then_kind_either` but for more than two predicates.
406    pub fn then_kind_any<F>(self, preds_is: &'static [F]) -> Self
407    where
408        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
409    {
410        self.then_kind_where(move |k| preds_is.iter().any(|pred| pred(k)))
411    }
412
413    /// Match a token where none of the token kind predicates returns true.
414    /// Like `then_kind_neither` but for more than two predicates.
415    pub fn then_kind_none_of<F>(self, preds_isnt: &'static [F]) -> Self
416    where
417        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
418    {
419        self.then_kind_where(move |k| preds_isnt.iter().all(|pred| !pred(k)))
420    }
421
422    /// Match a token where any of the token kind predicates returns true,
423    /// and the word is not in the list of exceptions.
424    pub fn then_kind_any_except<F>(
425        self,
426        preds_is: &'static [F],
427        ex: &'static [&'static str],
428    ) -> Self
429    where
430        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
431    {
432        self.then(move |tok: &Token, src: &[char]| {
433            preds_is.iter().any(|pred| pred(&tok.kind))
434                && !ex
435                    .iter()
436                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
437        })
438    }
439
440    /// Match a token where any of the token kind predicates returns true,
441    /// or the token is in the list of words.
442    pub fn then_kind_any_or_words<F>(
443        self,
444        preds: &'static [F],
445        words: &'static [&'static str],
446    ) -> Self
447    where
448        F: Fn(&TokenKind) -> bool + Send + Sync + 'static,
449    {
450        self.then(move |tok: &Token, src: &[char]| {
451            preds.iter().any(|pred| pred(&tok.kind))
452                || words
453                    .iter()
454                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
455        })
456    }
457
458    /// Match a token where any of the first token kind predicates returns true,
459    /// the second returns false, and the token is not in the list of exceptions.    
460    pub fn then_kind_any_but_not_except<F1, F2>(
461        self,
462        preds_is: &'static [F1],
463        pred_not: F2,
464        ex: &'static [&'static str],
465    ) -> Self
466    where
467        F1: Fn(&TokenKind) -> bool + Send + Sync + 'static,
468        F2: Fn(&TokenKind) -> bool + Send + Sync + 'static,
469    {
470        self.then(move |tok: &Token, src: &[char]| {
471            preds_is.iter().any(|pred| pred(&tok.kind))
472                && !pred_not(&tok.kind)
473                && !ex
474                    .iter()
475                    .any(|&word| tok.span.get_content(src).eq_ignore_ascii_case_str(word))
476        })
477    }
478
479    // Word property matching methods
480
481    // Out-of-vocabulary word. (Words not in the dictionary)
482    gen_then_from_is!(oov);
483    gen_then_from_is!(swear);
484
485    // Part-of-speech matching methods
486
487    // Nominals (nouns and pronouns)
488
489    gen_then_from_is!(nominal);
490    gen_then_from_is!(plural_nominal);
491    gen_then_from_is!(non_plural_nominal);
492    gen_then_from_is!(possessive_nominal);
493
494    // Nouns
495
496    gen_then_from_is!(noun);
497    gen_then_from_is!(proper_noun);
498    gen_then_from_is!(plural_noun);
499    gen_then_from_is!(mass_noun_only);
500
501    // Pronouns
502
503    gen_then_from_is!(pronoun);
504    gen_then_from_is!(personal_pronoun);
505    gen_then_from_is!(first_person_singular_pronoun);
506    gen_then_from_is!(first_person_plural_pronoun);
507    gen_then_from_is!(second_person_pronoun);
508    gen_then_from_is!(third_person_pronoun);
509    gen_then_from_is!(third_person_singular_pronoun);
510    gen_then_from_is!(third_person_plural_pronoun);
511    gen_then_from_is!(subject_pronoun);
512    gen_then_from_is!(object_pronoun);
513
514    // Verbs
515
516    gen_then_from_is!(verb);
517    gen_then_from_is!(auxiliary_verb);
518    gen_then_from_is!(linking_verb);
519    gen_then_from_is!(verb_lemma);
520    gen_then_from_is!(verb_simple_past_form);
521    gen_then_from_is!(verb_past_participle_form);
522    gen_then_from_is!(verb_progressive_form);
523
524    // Adjectives
525
526    gen_then_from_is!(adjective);
527    gen_then_from_is!(positive_adjective);
528    gen_then_from_is!(comparative_adjective);
529    gen_then_from_is!(superlative_adjective);
530
531    // Adverbs
532
533    gen_then_from_is!(adverb);
534    gen_then_from_is!(frequency_adverb);
535
536    // Determiners
537
538    gen_then_from_is!(determiner);
539    gen_then_from_is!(demonstrative_determiner);
540    gen_then_from_is!(possessive_determiner);
541    gen_then_from_is!(quantifier);
542    gen_then_from_is!(non_quantifier_determiner);
543    gen_then_from_is!(non_demonstrative_determiner);
544
545    /// Push an [`IndefiniteArticle`] to the end of the operation list.
546    pub fn then_indefinite_article(self) -> Self {
547        self.then(IndefiniteArticle::default())
548    }
549
550    // Other parts of speech
551
552    gen_then_from_is!(conjunction);
553    gen_then_from_is!(preposition);
554
555    // Numbers
556
557    gen_then_from_is!(number);
558    gen_then_from_is!(cardinal_number);
559    gen_then_from_is!(ordinal_number);
560
561    // Punctuation
562
563    gen_then_from_is!(punctuation);
564    gen_then_from_is!(apostrophe);
565    gen_then_from_is!(comma);
566    gen_then_from_is!(hyphen);
567    gen_then_from_is!(period);
568    gen_then_from_is!(semicolon);
569    gen_then_from_is!(quote);
570
571    // Other
572
573    gen_then_from_is!(case_separator);
574    gen_then_from_is!(likely_homograph);
575}
576
577impl<S> From<S> for SequenceExpr
578where
579    S: Step + 'static,
580{
581    fn from(step: S) -> Self {
582        Self {
583            exprs: vec![Box::new(step)],
584        }
585    }
586}
587
#[cfg(test)]
mod tests {
    use crate::{
        Document, TokenKind,
        expr::{ExprExt, SequenceExpr},
        linting::tests::SpanVecExt,
    };

    /// `then_kind_both` keeps only tokens that satisfy both predicates.
    #[test]
    fn test_kind_both() {
        let doc = Document::new_plain_english_curated("Use a good example.");
        let expr = SequenceExpr::default().then_kind_both(TokenKind::is_noun, TokenKind::is_verb);
        let found: Vec<_> = expr.iter_matches_in_doc(&doc).collect();
        assert_eq!(found.to_strings(&doc), vec!["Use", "good", "example"]);
    }

    /// `then_kind_either` keeps tokens that satisfy at least one predicate.
    #[test]
    fn test_adjective_or_determiner() {
        let doc = Document::new_plain_english_curated("Use a good example.");
        let adjective_or_determiner = SequenceExpr::default()
            .then_kind_either(TokenKind::is_adjective, TokenKind::is_determiner);
        let found: Vec<_> = adjective_or_determiner.iter_matches_in_doc(&doc).collect();
        assert_eq!(found.to_strings(&doc), vec!["a", "good"]);
    }

    /// `then_kind_is_but_is_not` keeps tokens passing the first predicate but not the second.
    #[test]
    fn test_noun_but_not_adjective() {
        let doc = Document::new_plain_english_curated("Use a good example.");
        let noun_not_adjective = SequenceExpr::default()
            .then_kind_is_but_is_not(TokenKind::is_noun, TokenKind::is_adjective);
        let found: Vec<_> = noun_not_adjective.iter_matches_in_doc(&doc).collect();
        assert_eq!(found.to_strings(&doc), vec!["Use", "example"]);
    }
}