Skip to main content

text_processing_rs/taggers/
cardinal.rs

1//! Cardinal number tagger.
2//!
3//! Converts spoken number words to digits:
4//! - "one" → "1"
5//! - "twenty one" → "21"
6//! - "one hundred twenty three" → "123"
7//! - "one thousand two hundred thirty four" → "1234"
8//! - "minus sixty" → "-60"
9
10use lazy_static::lazy_static;
11use std::collections::HashMap;
12
13lazy_static! {
14    /// Single digit and teen numbers
15    static ref ONES: HashMap<&'static str, i64> = {
16        let mut m = HashMap::new();
17        m.insert("zero", 0);
18        m.insert("one", 1);
19        m.insert("two", 2);
20        m.insert("three", 3);
21        m.insert("four", 4);
22        m.insert("five", 5);
23        m.insert("six", 6);
24        m.insert("seven", 7);
25        m.insert("eight", 8);
26        m.insert("nine", 9);
27        m.insert("ten", 10);
28        m.insert("eleven", 11);
29        m.insert("twelve", 12);
30        m.insert("thirteen", 13);
31        m.insert("fourteen", 14);
32        m.insert("fifteen", 15);
33        m.insert("sixteen", 16);
34        m.insert("seventeen", 17);
35        m.insert("eighteen", 18);
36        m.insert("nineteen", 19);
37        m
38    };
39
40    /// Tens (20, 30, 40, ...)
41    static ref TENS: HashMap<&'static str, i64> = {
42        let mut m = HashMap::new();
43        m.insert("twenty", 20);
44        m.insert("thirty", 30);
45        m.insert("forty", 40);
46        m.insert("fifty", 50);
47        m.insert("sixty", 60);
48        m.insert("seventy", 70);
49        m.insert("eighty", 80);
50        m.insert("ninety", 90);
51        m
52    };
53
54    /// Scale words (using i128 to support sextillion and larger)
55    static ref SCALES: HashMap<&'static str, i128> = {
56        let mut m = HashMap::new();
57        m.insert("hundred", 100);
58        m.insert("thousand", 1_000);
59        m.insert("million", 1_000_000);
60        m.insert("billion", 1_000_000_000);
61        m.insert("trillion", 1_000_000_000_000);
62        m.insert("quadrillion", 1_000_000_000_000_000);
63        m.insert("quintillion", 1_000_000_000_000_000_000);
64        m.insert("sextillion", 1_000_000_000_000_000_000_000_i128);
65        // Indian numbering system
66        m.insert("lakh", 100_000);
67        m.insert("crore", 10_000_000);
68        m
69    };
70}
71
72/// Parse spoken cardinal number to string representation.
73///
74/// Returns None if the input cannot be parsed as a number.
75pub fn parse(input: &str) -> Option<String> {
76    let input = input.to_lowercase();
77    let input = input.trim();
78
79    // Handle "zero" specially - NeMo returns "zero" not "0"
80    if input == "zero" {
81        return Some("zero".to_string());
82    }
83
84    // Check for negative
85    let (is_negative, rest) = if input.starts_with("minus ") {
86        (true, input.strip_prefix("minus ")?)
87    } else if input.starts_with("negative ") {
88        (true, input.strip_prefix("negative ")?)
89    } else {
90        (false, input)
91    };
92
93    let num = words_to_number(rest)?;
94
95    if is_negative {
96        Some(format!("-{}", num))
97    } else {
98        Some(num.to_string())
99    }
100}
101
102/// Convert spoken number words to integer.
103///
104/// Algorithm:
105/// 1. Tokenize input
106/// 2. Process left-to-right, accumulating values
107/// 3. Scale words (hundred, thousand, million) multiply the current accumulator
108/// 4. Handle "and" as a separator (ignored)
109///
110/// Examples:
111/// - "twenty one" → 20 + 1 = 21
112/// - "one hundred twenty three" → (1 * 100) + 20 + 3 = 123
113/// - "one thousand two hundred thirty four" → (1 * 1000) + (2 * 100) + 30 + 4 = 1234
114pub fn words_to_number(input: &str) -> Option<i128> {
115    let input = input.to_lowercase();
116    let words: Vec<&str> = input
117        .split_whitespace()
118        .filter(|w| *w != "and" && *w != "a")
119        .collect();
120
121    if words.is_empty() {
122        return None;
123    }
124
125    // Handle special case: "eleven hundred" = 1100
126    if words.len() == 2 && words[1] == "hundred" {
127        if let Some(&val) = ONES.get(words[0]) {
128            if val >= 11 && val <= 19 {
129                return Some((val * 100) as i128);
130            }
131        }
132        if let Some(&val) = TENS.get(words[0]) {
133            return Some((val * 100) as i128);
134        }
135    }
136
137    // Handle "eleven hundred twenty one" pattern
138    if words.len() >= 2 && words[1] == "hundred" {
139        if let Some(&first_val) = ONES.get(words[0]) {
140            if first_val >= 11 && first_val <= 99 {
141                let base = (first_val * 100) as i128;
142                if words.len() == 2 {
143                    return Some(base);
144                }
145                // Parse remaining words
146                let rest = words[2..].join(" ");
147                if let Some(remainder) = words_to_number(&rest) {
148                    return Some(base + remainder);
149                }
150            }
151        }
152        if let Some(&first_val) = TENS.get(words[0]) {
153            let base = (first_val * 100) as i128;
154            if words.len() == 2 {
155                return Some(base);
156            }
157            let rest = words[2..].join(" ");
158            if let Some(remainder) = words_to_number(&rest) {
159                return Some(base + remainder);
160            }
161        }
162    }
163
164    let mut result: i128 = 0;
165    let mut current: i128 = 0;
166    let mut found_number = false;
167
168    for word in words {
169        if let Some(&val) = ONES.get(word) {
170            current += val as i128;
171            found_number = true;
172        } else if let Some(&val) = TENS.get(word) {
173            current += val as i128;
174            found_number = true;
175        } else if word == "hundred" {
176            if current == 0 {
177                current = 1;
178            }
179            current *= 100;
180            found_number = true;
181        } else if let Some(&scale) = SCALES.get(word) {
182            if scale >= 1000 {
183                if current == 0 {
184                    current = 1;
185                }
186                current *= scale;
187                result += current;
188                current = 0;
189                found_number = true;
190            }
191        } else {
192            // Unknown word - not a valid number
193            return None;
194        }
195    }
196
197    if found_number {
198        Some(result + current)
199    } else {
200        None
201    }
202}
203
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every spoken phrase parses to its paired written form.
    fn check(cases: &[(&str, &str)]) {
        for &(spoken, written) in cases {
            assert_eq!(
                parse(spoken),
                Some(written.to_string()),
                "input: {:?}",
                spoken
            );
        }
    }

    #[test]
    fn test_ones() {
        check(&[
            ("one", "1"),
            ("two", "2"),
            ("nine", "9"),
            ("ten", "10"),
            ("fifteen", "15"),
            ("nineteen", "19"),
        ]);
    }

    #[test]
    fn test_tens() {
        check(&[
            ("twenty", "20"),
            ("twenty one", "21"),
            ("forty two", "42"),
            ("ninety nine", "99"),
        ]);
    }

    #[test]
    fn test_hundreds() {
        check(&[
            ("one hundred", "100"),
            ("one hundred one", "101"),
            ("one hundred and one", "101"),
            ("two hundred twenty two", "222"),
        ]);
    }

    #[test]
    fn test_eleven_hundred() {
        check(&[
            ("eleven hundred", "1100"),
            ("twenty one hundred", "2100"),
            ("eleven hundred twenty one", "1121"),
        ]);
    }

    #[test]
    fn test_thousands() {
        check(&[
            ("one thousand", "1000"),
            ("one thousand one", "1001"),
            ("one thousand one hundred", "1100"),
            ("one thousand two hundred thirty four", "1234"),
        ]);
    }

    #[test]
    fn test_millions() {
        check(&[("one million", "1000000"), ("two million three", "2000003")]);
    }

    #[test]
    fn test_negative() {
        check(&[
            ("minus sixty", "-60"),
            ("minus twenty five thousand thirty seven", "-25037"),
        ]);
    }

    #[test]
    fn test_zero() {
        // "zero" is echoed back as a word, not converted to a digit.
        check(&[("zero", "zero")]);
    }

    #[test]
    fn test_invalid() {
        for phrase in ["hello", "one hello"].iter() {
            assert_eq!(parse(phrase), None, "input: {:?}", phrase);
        }
    }
}