harper_core/expr/
spelled_number_expr.rs

1use crate::expr::LongestMatchOf;
2use crate::patterns::{WhitespacePattern, WordSet};
3use crate::{Span, Token};
4
5use super::{Expr, SequenceExpr};
6
/// Matches spelled-out numbers from zero to ninety-nine.
///
/// Both single words ("zero", "seven", "twelve", "forty") and two-word
/// compounds joined by a hyphen or whitespace ("forty-two", "forty two")
/// are recognized.
#[derive(Debug, Default, Clone, Copy)]
pub struct SpelledNumberExpr;
10
11impl Expr for SpelledNumberExpr {
12    fn run(&self, cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span> {
13        if tokens.is_empty() {
14            return None;
15        }
16
17        // The numbers that can be in the 2nd position of a compound number.
18        // A subset of the standalone numbers since we can't say "twenty zero" or "twenty eleven"
19        // "Zero" and "ten" don't belong: twenty-one ✅ twenty-zero ❌ twenty-ten ❌
20        let units = &[
21            "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
22        ];
23
24        // These can't make a compound with `tens` but they can stand alone
25        let teens = &[
26            "ten",
27            "eleven",
28            "twelve",
29            "thirteen",
30            "fourteen",
31            "fifteen",
32            "sixteen",
33            "seventeen",
34            "eighteen",
35            "nineteen",
36        ];
37
38        // These can make a compound with the part_2 standalones above.
39        // "Ten" and "hundred" don't belong: twenty-one ✅ ten-one ❌ hundred-one ❌
40        let tens = &[
41            "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
42        ];
43
44        let single_words = WordSet::new(
45            &units
46                .iter()
47                .chain(teens.iter())
48                .chain(tens.iter())
49                .copied()
50                .chain(std::iter::once("zero"))
51                .collect::<Vec<&str>>(),
52        );
53
54        let tens_units_compounds = SequenceExpr::default()
55            .then(WordSet::new(tens))
56            .then(LongestMatchOf::new(vec![
57                Box::new(|t: &Token, _s: &[char]| t.kind.is_hyphen()),
58                Box::new(WhitespacePattern),
59            ]))
60            .then(WordSet::new(units));
61
62        let expr =
63            LongestMatchOf::new(vec![Box::new(single_words), Box::new(tens_units_compounds)]);
64
65        expr.run(cursor, tokens, source)
66    }
67}
68
#[cfg(test)]
mod tests {
    use super::SpelledNumberExpr;
    use crate::expr::ExprExt;
    use crate::{Document, Span};

    /// Test helper: renders token-index spans back into the text they cover.
    trait SpanVecExt {
        /// Concatenates the source text of every token inside each span.
        fn to_strings(&self, doc: &Document) -> Vec<String>;
    }

    // Implemented on the slice so it is usable from both `Vec<Span>` and `&[Span]`.
    impl SpanVecExt for [Span] {
        fn to_strings(&self, doc: &Document) -> Vec<String> {
            self.iter()
                .map(|sp| {
                    doc.get_tokens()[sp.start..sp.end]
                        .iter()
                        .map(|tok| doc.get_span_content_str(&tok.span))
                        .collect::<String>()
                })
                .collect()
        }
    }

    #[test]
    fn matches_single_digit() {
        let doc = Document::new_markdown_default_curated("one two three");
        let matches = SpelledNumberExpr.iter_matches_in_doc(&doc);
        assert_eq!(matches.count(), 3);
    }

    #[test]
    fn matches_teens() {
        let doc = Document::new_markdown_default_curated("ten eleven twelve");
        let matches = SpelledNumberExpr.iter_matches_in_doc(&doc);
        assert_eq!(matches.count(), 3);
    }

    #[test]
    fn matches_tens() {
        let doc = Document::new_markdown_default_curated("twenty thirty forty");
        let matches = SpelledNumberExpr.iter_matches_in_doc(&doc);
        assert_eq!(matches.count(), 3);
    }

    #[test]
    fn matches_compound_numbers() {
        let doc = Document::new_markdown_default_curated("twenty-one thirty-two");
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        // Asserting on the matched text (rather than only the count) makes a
        // failure self-explanatory without any debug printing.
        assert_eq!(matches.to_strings(&doc), vec!["twenty-one", "thirty-two"]);
    }

    #[test]
    fn deep_thought() {
        let doc = Document::new_markdown_default_curated(
            "the answer to the ultimate question of life, the universe, and everything is forty-two",
        );
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(matches.to_strings(&doc), vec!["forty-two"]);
    }

    #[test]
    fn jacksons() {
        let doc = Document::new_markdown_default_curated(
            "A, B, C It's easy as one, two, three. Or simple as Do-Re-Mi",
        );
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(matches.to_strings(&doc), vec!["one", "two", "three"]);
    }

    #[test]
    fn orwell() {
        let doc = Document::new_markdown_default_curated("Nineteen Eighty-Four");
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(matches.to_strings(&doc), vec!["Nineteen", "Eighty-Four"]);
    }

    #[test]
    fn get_smart() {
        let doc = Document::new_markdown_default_curated(
            "Maxwell Smart was Agent Eighty-Six, but who was Agent Ninety-Nine?",
        );
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(matches.to_strings(&doc), vec!["Eighty-Six", "Ninety-Nine"]);
    }

    #[test]
    fn hyphens_or_spaces() {
        let doc = Document::new_markdown_default_curated(
            "twenty-one, thirty two, forty-three, fifty four, sixty-five, seventy six, eighty-seven, ninety eight",
        );
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(
            matches.to_strings(&doc),
            vec![
                "twenty-one",
                "thirty two",
                "forty-three",
                "fifty four",
                "sixty-five",
                "seventy six",
                "eighty-seven",
                "ninety eight",
            ]
        );
    }

    #[test]
    fn waiting_since() {
        let doc = Document::new_markdown_default_curated("I have been waiting since two hours.");
        let matches = SpelledNumberExpr
            .iter_matches_in_doc(&doc)
            .collect::<Vec<_>>();

        assert_eq!(matches.to_strings(&doc), vec!["two"]);
    }
}