harper_core/expr/
spelled_number_expr.rs

1use crate::expr::LongestMatchOf;
2use crate::patterns::{WhitespacePattern, WordSet};
3use crate::{Span, Token};
4
5use super::{Expr, SequenceExpr};
6
/// Matches spelled-out numbers from zero to ninety-nine.
///
/// Accepted forms:
/// - a single number word: "zero", the units "one" to "nine", "ten" to
///   "nineteen", or a round ten from "twenty" to "ninety";
/// - a compound of a round ten ("twenty" to "ninety") followed by a hyphen or
///   whitespace and then a unit ("one" to "nine"), e.g. "forty-two" or
///   "forty two".
///
/// The type carries no state, so it is cheap to copy and construct.
#[derive(Debug, Clone, Copy, Default)]
pub struct SpelledNumberExpr;
10
11impl Expr for SpelledNumberExpr {
12    fn run(&self, cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
13        if tokens.is_empty() {
14            return None;
15        }
16
17        // The numbers that can be in the 2nd position of a compound number.
18        // A subset of the standalone numbers since we can't say "twenty zero" or "twenty eleven"
19        // "Zero" and "ten" don't belong: twenty-one ✅ twenty-zero ❌ twenty-ten ❌
20        let units = &[
21            "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
22        ];
23
24        // These can't make a compound with `tens` but they can stand alone
25        let teens = &[
26            "ten",
27            "eleven",
28            "twelve",
29            "thirteen",
30            "fourteen",
31            "fifteen",
32            "sixteen",
33            "seventeen",
34            "eighteen",
35            "nineteen",
36        ];
37
38        // These can make a compound with the part_2 standalones above.
39        // "Ten" and "hundred" don't belong: twenty-one ✅ ten-one ❌ hundred-one ❌
40        let tens = &[
41            "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
42        ];
43
44        let single_words = WordSet::new(
45            &units
46                .iter()
47                .chain(teens.iter())
48                .chain(tens.iter())
49                .copied()
50                .chain(std::iter::once("zero"))
51                .collect::<Vec<&str>>(),
52        );
53
54        let tens_units_compounds = SequenceExpr::default()
55            .then(WordSet::new(tens))
56            .then_any_of(vec![
57                Box::new(|t: &Token, _s: &[char]| t.kind.is_hyphen()),
58                Box::new(WhitespacePattern),
59            ])
60            .then(WordSet::new(units));
61
62        let expr =
63            LongestMatchOf::new(vec![Box::new(single_words), Box::new(tens_units_compounds)]);
64
65        expr.run(cursor, tokens, source)
66    }
67}
68
#[cfg(test)]
mod tests {
    use super::SpelledNumberExpr;
    use crate::Document;
    use crate::expr::ExprExt;
    use crate::linting::tests::SpanVecExt;

    /// Parses `text` as a document, runs [`SpelledNumberExpr`] over it, and
    /// asserts that the matched substrings equal `expected`, in order.
    ///
    /// `#[track_caller]` makes a failure point at the calling test rather
    /// than at this helper.
    #[track_caller]
    fn assert_matches(text: &str, expected: &[&str]) {
        let doc = Document::new_markdown_default_curated(text);
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(matches.to_strings(&doc), expected.to_vec());
    }

    #[test]
    fn matches_single_digit() {
        assert_matches("one two three", &["one", "two", "three"]);
    }

    #[test]
    fn matches_teens() {
        assert_matches("ten eleven twelve", &["ten", "eleven", "twelve"]);
    }

    #[test]
    fn matches_tens() {
        // Adjacent round tens must not be merged into a compound.
        assert_matches("twenty thirty forty", &["twenty", "thirty", "forty"]);
    }

    #[test]
    fn matches_compound_numbers() {
        assert_matches("twenty-one thirty-two", &["twenty-one", "thirty-two"]);
    }

    #[test]
    fn deep_thought() {
        assert_matches(
            "the answer to the ultimate question of life, the universe, and everything is forty-two",
            &["forty-two"],
        );
    }

    #[test]
    fn jacksons() {
        assert_matches(
            "A, B, C It's easy as one, two, three. Or simple as Do-Re-Mi",
            &["one", "two", "three"],
        );
    }

    #[test]
    fn orwell() {
        // A teen followed by a compound must stay two separate matches.
        assert_matches("Nineteen Eighty-Four", &["Nineteen", "Eighty-Four"]);
    }

    #[test]
    fn get_smart() {
        assert_matches(
            "Maxwell Smart was Agent Eighty-Six, but who was Agent Ninety-Nine?",
            &["Eighty-Six", "Ninety-Nine"],
        );
    }

    #[test]
    fn hyphens_or_spaces() {
        assert_matches(
            "twenty-one, thirty two, forty-three, fifty four, sixty-five, seventy six, eighty-seven, ninety eight",
            &[
                "twenty-one",
                "thirty two",
                "forty-three",
                "fifty four",
                "sixty-five",
                "seventy six",
                "eighty-seven",
                "ninety eight",
            ],
        );
    }

    #[test]
    fn waiting_since() {
        assert_matches("I have been waiting since two hours.", &["two"]);
    }
}