Skip to main content

harper_core/expr/
spelled_number_expr.rs

1use crate::expr::LongestMatchOf;
2use crate::patterns::{WhitespacePattern, WordSet};
3use crate::{Span, Token};
4
5use super::{Expr, SequenceExpr};
6
/// Matches spelled-out cardinal numbers from zero to ninety-nine.
///
/// Accepts single number words ("zero", "seven", "twelve", "forty") as well as
/// tens-and-units compounds joined by either a hyphen or whitespace
/// ("forty-two", "forty two").
#[derive(Debug, Default, Clone, Copy)]
pub struct SpelledNumberExpr;
10
11impl Expr for SpelledNumberExpr {
12    fn run(&self, cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
13        if tokens.is_empty() {
14            return None;
15        }
16
17        // The numbers that can be in the 2nd position of a compound number.
18        // A subset of the standalone numbers since we can't say "twenty zero" or "twenty eleven"
19        // "Zero" and "ten" don't belong: twenty-one ✅ twenty-zero ❌ twenty-ten ❌
20        let units = &[
21            "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
22        ];
23
24        // These can't make a compound with `tens` but they can stand alone
25        let teens = &[
26            "ten",
27            "eleven",
28            "twelve",
29            "thirteen",
30            "fourteen",
31            "fifteen",
32            "sixteen",
33            "seventeen",
34            "eighteen",
35            "nineteen",
36        ];
37
38        // These can make a compound with the part_2 standalones above.
39        // "Ten" and "hundred" don't belong: twenty-one ✅ ten-one ❌ hundred-one ❌
40        let tens = &[
41            "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
42        ];
43
44        let single_words = WordSet::new(
45            &units
46                .iter()
47                .chain(teens.iter())
48                .chain(tens.iter())
49                .copied()
50                .chain(std::iter::once("zero"))
51                .collect::<Vec<&str>>(),
52        );
53
54        let tens_units_compounds = SequenceExpr::word_set(tens)
55            .then_any_of(vec![
56                Box::new(|t: &Token, _s: &[char]| t.kind.is_hyphen()),
57                Box::new(WhitespacePattern),
58            ])
59            .then_word_set(units);
60
61        let expr =
62            LongestMatchOf::new(vec![Box::new(single_words), Box::new(tens_units_compounds)]);
63
64        expr.run(cursor, tokens, source)
65    }
66}
67
#[cfg(test)]
mod tests {
    //! Tests for `SpelledNumberExpr`: single words, compounds joined by a
    //! hyphen or a space, and numbers embedded in ordinary sentences.

    use super::SpelledNumberExpr;
    use crate::Document;
    use crate::expr::ExprExt;
    use crate::linting::tests::SpanVecExt;

    #[test]
    fn matches_zero() {
        // "zero" is a standalone number even though it never appears in a compound.
        let doc = Document::new_markdown_default_curated("zero");
        let matches = SpelledNumberExpr.iter_matches_in_doc(&doc);
        assert_eq!(matches.count(), 1);
    }

    #[test]
    fn matches_single_digit() {
        let doc = Document::new_markdown_default_curated("one two three");
        let matches = SpelledNumberExpr.iter_matches_in_doc(&doc);
        assert_eq!(matches.count(), 3);
    }

    #[test]
    fn matches_teens() {
        let doc = Document::new_markdown_default_curated("ten eleven twelve");
        let matches = SpelledNumberExpr.iter_matches_in_doc(&doc);
        assert_eq!(matches.count(), 3);
    }

    #[test]
    fn matches_tens() {
        let doc = Document::new_markdown_default_curated("twenty thirty forty");
        let matches = SpelledNumberExpr.iter_matches_in_doc(&doc);
        assert_eq!(matches.count(), 3);
    }

    #[test]
    fn matches_compound_numbers() {
        let doc = Document::new_markdown_default_curated("twenty-one thirty-two");
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        // Check the matched text, not just the count, so a partial match
        // (e.g. only "twenty") cannot slip through.
        assert_eq!(
            matches.to_strings(&doc),
            vec!["twenty-one", "thirty-two"]
        );
    }

    #[test]
    fn deep_thought() {
        let doc = Document::new_markdown_default_curated(
            "the answer to the ultimate question of life, the universe, and everything is forty-two",
        );
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(matches.to_strings(&doc), vec!["forty-two"]);
    }

    #[test]
    fn jacksons() {
        let doc = Document::new_markdown_default_curated(
            "A, B, C It's easy as one, two, three. Or simple as Do-Re-Mi",
        );
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(matches.to_strings(&doc), vec!["one", "two", "three"]);
    }

    #[test]
    fn orwell() {
        // A teen followed by a compound must yield two separate matches.
        let doc = Document::new_markdown_default_curated("Nineteen Eighty-Four");
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(matches.to_strings(&doc), vec!["Nineteen", "Eighty-Four"]);
    }

    #[test]
    fn get_smart() {
        let doc = Document::new_markdown_default_curated(
            "Maxwell Smart was Agent Eighty-Six, but who was Agent Ninety-Nine?",
        );
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(matches.to_strings(&doc), vec!["Eighty-Six", "Ninety-Nine"]);
    }

    #[test]
    fn hyphens_or_spaces() {
        // Compounds are accepted with either separator.
        let doc = Document::new_markdown_default_curated(
            "twenty-one, thirty two, forty-three, fifty four, sixty-five, seventy six, eighty-seven, ninety eight",
        );
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(
            matches.to_strings(&doc),
            vec![
                "twenty-one",
                "thirty two",
                "forty-three",
                "fifty four",
                "sixty-five",
                "seventy six",
                "eighty-seven",
                "ninety eight",
            ]
        );
    }

    #[test]
    fn waiting_since() {
        let doc = Document::new_markdown_default_curated("I have been waiting since two hours.");
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(matches.to_strings(&doc), vec!["two"]);
    }
}