partial_date/word_numbers.rs
//! Word-number recognition and substitution.
//!
//! This module converts English written numbers (e.g. `"twenty-three"`,
//! `"one thousand nine hundred eighty-four"`) into their digit equivalents,
//! allowing the tokeniser to treat them the same as numerals.
//!
//! # Approach
//!
//! The public entry point is [`replace_word_numbers`], which scans an
//! utterance for the longest contiguous word-number span it can parse and
//! replaces it with the decimal representation. Multiple non-overlapping
//! spans are replaced left-to-right.
//!
//! Each individual word is fuzzy-matched against the canonical English
//! number vocabulary using [`crate::levenshtein::levenshtein_ratio`]. This
//! lets the module tolerate common typos, repeated characters, and
//! transpositions. Phonetic spelling patterns from non-English speakers are
//! tolerated only when they still clear the match threshold; those that do
//! not are a known limitation (see the test suite).
//!
//! Ordinal forms (`"first"`, `"twenty-third"`, etc.) are included in the
//! vocabulary so they parse identically to their cardinal equivalents.
//!
//! Common English stop words (`"the"`, `"of"`, `"and"`, etc.) are explicitly
//! excluded so they cannot produce false-positive number matches.
//!
//! # Supported range
//!
//! 1 – 3000, covering every value that is meaningful as a day (1–31), month
//! (1–12), or year (1–3000) in the date extraction context. The parser does
//! not clamp to this range: the grammar below can express values up to 9999.
//!
//! # Grammar
//!
//! ```text
//! number     ::= thousands? hundreds? tens_units
//! thousands  ::= unit "thousand"
//! hundreds   ::= unit "hundred"
//! tens_units ::= tens unit?   (e.g. "twenty", "twenty-one", "twenty-third")
//!              | teen         (e.g. "fourteenth")
//!              | unit         (e.g. "seventh")
//!              | (empty)
//! ```
//!
//! Hyphenated compound words (`"twenty-one"`) are split on `-` before
//! individual word matching so the hyphen is treated as a separator.
44
45use crate::levenshtein::levenshtein_ratio;
46
47// ---------------------------------------------------------------------------
48// Fuzzy matching threshold
49// ---------------------------------------------------------------------------
50
/// Similarity floor for fuzzy matching: a word counts as a number word only
/// when its [`levenshtein_ratio`] against a vocabulary entry is at least this
/// value.
///
/// Figures recorded for the chosen value of 0.65:
///
/// * rejected cross-category near-misses: `"six"` vs `"sixty"` (0.60) and
///   `"three"` vs `"thirteen"` (0.625);
/// * accepted typos from this library's tests: `"theer"` vs `"three"` (0.80)
///   and `"sevne"` vs `"seven"` (0.86).
///
/// Phonetic spellings from non-English speakers (Swahili, Hausa, Zulu, etc.)
/// often score below the floor. That is a known limitation; see the ignored
/// `word_numbers_non_english` test module.
const MATCH_THRESHOLD: f32 = 0.65;
// TODO: raise this threshold. The accepted examples listed above do not
// justify a value this low, and loose phonetic spellings should not be
// converted at all; `naayiti` in particular must not be transformed to
// `eighty`.
64
65// ---------------------------------------------------------------------------
66// Stop-word blocklist
67// ---------------------------------------------------------------------------
68
/// Blocklist of everyday English words that are never treated as number
/// words, no matter how close they are to an entry in the number vocabulary.
///
/// Quoted similarity figures: `"on"` vs `"one"` and `"or"` vs `"four"` both
/// score 0.67, which would otherwise be accepted as number words. `"the"`
/// vs `"three"` scores 0.60; that is below the current 0.65 threshold, so
/// its entry is a safeguard should the threshold ever be lowered.
///
/// Entries are lowercase and are compared exactly by `is_stop_word`.
static STOP_WORDS: &[&str] = &[
    // Function words.
    "the", "of", "on", "or", "in", "at", "to", "a", "an", "and", "as", "is", "it", "be", "do",
    "so", "up", "by", "if", "no", "my", "we", "he", "me", "us", "am", "are", "was", "not", "but",
    // Date-domain nouns that commonly sit next to a number.
    "day", "date", "year", "month", "time", "age",
];
80
81/// Return `true` if `word` is a known stop word and should never be parsed as
82/// a number word.
83fn is_stop_word(word: &str) -> bool {
84 STOP_WORDS.contains(&word)
85}
86
87// ---------------------------------------------------------------------------
88// Vocabulary tables
89// ---------------------------------------------------------------------------
90
/// Vocabulary for the values 1–9 as `(canonical_spelling, value)` pairs.
///
/// Both the cardinal and the ordinal spelling of each value are listed, so
/// `"one"` and `"first"` both resolve to 1.
///
/// Entry order matters: when two entries score the same similarity for a
/// word, `best_match` keeps the later one.
static UNITS: &[(&str, i32)] = &[
    // Cardinal spellings, 1 through 9.
    ("one", 1),
    ("two", 2),
    ("three", 3),
    ("four", 4),
    ("five", 5),
    ("six", 6),
    ("seven", 7),
    ("eight", 8),
    ("nine", 9),
    // Ordinal spellings, 1st through 9th.
    ("first", 1),
    ("second", 2),
    ("third", 3),
    ("fourth", 4),
    ("fifth", 5),
    ("sixth", 6),
    ("seventh", 7),
    ("eighth", 8),
    ("ninth", 9),
];
117
/// Vocabulary for the values 10–19 as `(canonical_spelling, value)` pairs,
/// covering both cardinal (`"twelve"`) and ordinal (`"twelfth"`) spellings.
///
/// Entry order matters: when two entries score the same similarity for a
/// word, `best_match` keeps the later one.
static TEENS: &[(&str, i32)] = &[
    // Cardinal spellings, 10 through 19.
    ("ten", 10),
    ("eleven", 11),
    ("twelve", 12),
    ("thirteen", 13),
    ("fourteen", 14),
    ("fifteen", 15),
    ("sixteen", 16),
    ("seventeen", 17),
    ("eighteen", 18),
    ("nineteen", 19),
    // Ordinal spellings, 10th through 19th.
    ("tenth", 10),
    ("eleventh", 11),
    ("twelfth", 12),
    ("thirteenth", 13),
    ("fourteenth", 14),
    ("fifteenth", 15),
    ("sixteenth", 16),
    ("seventeenth", 17),
    ("eighteenth", 18),
    ("nineteenth", 19),
];
143
/// Vocabulary for the multiples of ten from 20 to 90 as
/// `(canonical_spelling, value)` pairs, covering both cardinal (`"forty"`)
/// and ordinal (`"fortieth"`) spellings.
///
/// Entry order matters: when two entries score the same similarity for a
/// word, `best_match` keeps the later one.
static TENS: &[(&str, i32)] = &[
    // Cardinal spellings, 20 through 90.
    ("twenty", 20),
    ("thirty", 30),
    ("forty", 40),
    ("fifty", 50),
    ("sixty", 60),
    ("seventy", 70),
    ("eighty", 80),
    ("ninety", 90),
    // Ordinal spellings, 20th through 90th.
    ("twentieth", 20),
    ("thirtieth", 30),
    ("fortieth", 40),
    ("fiftieth", 50),
    ("sixtieth", 60),
    ("seventieth", 70),
    ("eightieth", 80),
    ("ninetieth", 90),
];
165
/// Canonical spellings of the ×100 scale word: cardinal `"hundred"` and
/// ordinal `"hundredth"`.
static HUNDREDS: &[&str] = &["hundred", "hundredth"];
/// Canonical spellings of the ×1000 scale word: cardinal `"thousand"` and
/// ordinal `"thousandth"`.
static THOUSANDS: &[&str] = &["thousand", "thousandth"];
170
171// ---------------------------------------------------------------------------
172// Internal word matching
173// ---------------------------------------------------------------------------
174
175/// Try to fuzzy-match `word` against every entry in `table`.
176///
177/// Returns the value of the best-matching entry if its similarity is at or
178/// above [`MATCH_THRESHOLD`], or `None` if nothing is close enough.
179/// Stop words are rejected before any table lookup.
180fn best_match(word: &str, table: &[(&str, i32)]) -> Option<i32> {
181 if is_stop_word(word) {
182 return None;
183 }
184 table
185 .iter()
186 .map(|&(canonical, value)| (levenshtein_ratio(word, canonical), value))
187 .filter(|&(ratio, _)| ratio >= MATCH_THRESHOLD)
188 .max_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal))
189 .map(|(_, value)| value)
190}
191
192/// Try to match `word` as a unit or its ordinal (1–9).
193fn match_unit(word: &str) -> Option<i32> {
194 best_match(word, UNITS)
195}
196
197/// Try to match `word` as a teen or its ordinal (10–19).
198fn match_teen(word: &str) -> Option<i32> {
199 best_match(word, TEENS)
200}
201
202/// Try to match `word` as a tens value or its ordinal (20–90).
203fn match_tens(word: &str) -> Option<i32> {
204 best_match(word, TENS)
205}
206
/// Which vocabulary table a single word was matched against.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum NumberCategory {
    /// Matched in the units table (1–9).
    Unit,
    /// Matched in the teens table (10–19).
    Teen,
    /// Matched in the tens table (20–90).
    Tens,
}
214
215/// Return the highest levenshtein ratio for `word` against any entry in
216/// `table` that meets [`MATCH_THRESHOLD`], or `0.0` if none qualify.
217fn best_ratio(word: &str, table: &[(&str, i32)]) -> f32 {
218 if is_stop_word(word) {
219 return 0.0;
220 }
221 table
222 .iter()
223 .map(|&(canonical, _)| levenshtein_ratio(word, canonical))
224 .filter(|&ratio| ratio >= MATCH_THRESHOLD)
225 .fold(0.0_f32, f32::max)
226}
227
228/// Match `word` against all three categories (units, teens, tens) and return
229/// the value from whichever category has the strictly highest best ratio.
230///
231/// On a tie the preference order is tens > teens > units, so that e.g.
232/// `"twenty"` (exact tens match) is never trumped by a unit ordinal that
233/// happens to tie.
234///
235/// This cross-category comparison prevents a word like `"seven"` from being
236/// misclassified as `70` because `"seventy"` (tens) scores 0.714 and fires
237/// before the exact unit match (1.0) under a fixed-order strategy.
238fn match_best_single_word(word: &str) -> Option<(i32, NumberCategory)> {
239 let unit_ratio = best_ratio(word, UNITS);
240 let teen_ratio = best_ratio(word, TEENS);
241 let tens_ratio = best_ratio(word, TENS);
242
243 if unit_ratio == 0.0 && teen_ratio == 0.0 && tens_ratio == 0.0 {
244 return None;
245 }
246
247 // Strictly highest ratio wins. Tie-break: tens > teens > units.
248 if unit_ratio > teen_ratio && unit_ratio > tens_ratio {
249 match_unit(word).map(|value| (value, NumberCategory::Unit))
250 } else if teen_ratio > tens_ratio {
251 match_teen(word).map(|value| (value, NumberCategory::Teen))
252 } else {
253 match_tens(word).map(|value| (value, NumberCategory::Tens))
254 }
255}
256
257/// Try to match `word` as "hundred" (or "hundredth").
258fn match_hundred(word: &str) -> bool {
259 if is_stop_word(word) {
260 return false;
261 }
262 HUNDREDS
263 .iter()
264 .any(|&canonical| levenshtein_ratio(word, canonical) >= MATCH_THRESHOLD)
265}
266
267/// Try to match `word` as "thousand" (or "thousandth").
268fn match_thousand(word: &str) -> bool {
269 if is_stop_word(word) {
270 return false;
271 }
272 THOUSANDS
273 .iter()
274 .any(|&canonical| levenshtein_ratio(word, canonical) >= MATCH_THRESHOLD)
275}
276
277// ---------------------------------------------------------------------------
278// Tokenisation of an utterance into words
279// ---------------------------------------------------------------------------
280
/// Split `utterance` into word tokens, each paired with the byte offset at
/// which it starts in `utterance`.
///
/// Separators are the hyphen `-` and every Unicode whitespace character (as
/// defined by [`char::is_whitespace`]). That covers space, tab, and line
/// breaks, and also non-breaking and other Unicode spaces. Runs of
/// separators produce no empty tokens, and an empty or separator-only
/// utterance yields an empty vector.
///
/// The offsets let the caller splice replacements back into the original
/// string without disturbing the text between tokens.
///
/// NOTE(review): punctuation is not a separator, so a word such as
/// `"third,"` is returned with its trailing comma attached.
fn word_tokens(utterance: &str) -> Vec<(usize, &str)> {
    fn is_separator(character: char) -> bool {
        character == '-' || character.is_whitespace()
    }

    let mut tokens = Vec::new();
    // Byte offset of the start of the current piece within `utterance`.
    let mut offset = 0;

    // Each piece is a (possibly empty) word followed by at most one
    // separator, so summing piece lengths tracks the byte offset exactly.
    for piece in utterance.split_inclusive(is_separator) {
        let word = piece.strip_suffix(is_separator).unwrap_or(piece);
        if !word.is_empty() {
            tokens.push((offset, word));
        }
        offset += piece.len();
    }

    tokens
}
310
311// ---------------------------------------------------------------------------
312// Greedy number span parser
313// ---------------------------------------------------------------------------
314
315/// Attempt to parse a number starting at `tokens[cursor]`.
316///
317/// Returns `(value, words_consumed)` if a number was parsed, or `None`.
318///
319/// Grammar (greedy, left-to-right):
320/// ```text
321/// number ::= thousands? hundreds? tens_units
322/// thousands ::= unit "thousand"
323/// hundreds ::= unit "hundred"
324/// tens_units ::= (tens unit?) | teen | unit | ε
325/// ```
326fn try_parse_number(tokens: &[(usize, &str)], cursor: usize) -> Option<(i32, usize)> {
327 let lower_word = |index: usize| -> Option<String> {
328 tokens.get(index).map(|(_, word)| word.to_ascii_lowercase())
329 };
330
331 let mut position = cursor;
332 let mut total: i32 = 0;
333
334 // --- Thousands component ------------------------------------------------
335 // Pattern: <unit> "thousand"
336 if let Some(unit_word) = lower_word(position)
337 && let Some(unit_value) = match_unit(&unit_word)
338 && let Some(thousand_word) = lower_word(position + 1)
339 && match_thousand(&thousand_word)
340 {
341 total += unit_value * 1000;
342 position += 2;
343 }
344
345 // --- Hundreds component -------------------------------------------------
346 // Pattern: <unit> "hundred"
347 if let Some(unit_word) = lower_word(position)
348 && let Some(unit_value) = match_unit(&unit_word)
349 && let Some(hundred_word) = lower_word(position + 1)
350 && match_hundred(&hundred_word)
351 {
352 total += unit_value * 100;
353 position += 2;
354 }
355
356 // --- Tens-and-units component -------------------------------------------
357 //
358 // Use cross-category best-ratio selection so that a word like "seven" is
359 // never misclassified as 70 just because "seventy" (tens) clears the
360 // threshold before the exact unit match is checked.
361 if let Some(word) = lower_word(position) {
362 match match_best_single_word(&word) {
363 Some((value, NumberCategory::Tens)) => {
364 total += value;
365 position += 1;
366 // A tens word may optionally be followed by a unit
367 // (e.g. "twenty" + "one" → 21).
368 if let Some(unit_word) = lower_word(position)
369 && let Some((unit_value, _)) = match_best_single_word(&unit_word)
370 // Only accept a Unit or Teen here, not another Tens.
371 && matches!(
372 match_best_single_word(&unit_word),
373 Some((_, NumberCategory::Unit | NumberCategory::Teen))
374 )
375 {
376 total += unit_value;
377 position += 1;
378 }
379 }
380 Some((value, NumberCategory::Teen | NumberCategory::Unit)) => {
381 total += value;
382 position += 1;
383 }
384 None => {
385 // No match at this position — fine if thousands/hundreds
386 // already accumulated something.
387 }
388 }
389 }
390
391 let words_consumed = position - cursor;
392
393 // Require at least one word consumed and a positive total.
394 if words_consumed == 0 || total <= 0 {
395 return None;
396 }
397
398 Some((total, words_consumed))
399}
400
401// ---------------------------------------------------------------------------
402// Public API
403// ---------------------------------------------------------------------------
404
405/// Scan `utterance` for word-number spans and replace each with its decimal
406/// representation, returning the modified string.
407///
408/// Spans are found greedily left-to-right. The longest parseable span
409/// starting at each position is consumed; overlapping spans are not
410/// considered. Words that do not participate in a recognised number span
411/// are left in place unchanged (including month names, noise words, etc.).
412///
413/// Ordinal forms (`"first"`, `"twenty-third"`, etc.) are treated identically
414/// to their cardinal equivalents (`"one"`, `"twenty-three"`).
415///
416/// # Examples
417///
418/// ```
419/// use partial_date::word_numbers::replace_word_numbers;
420///
421/// assert_eq!(replace_word_numbers("twenty-three"), "23");
422/// assert_eq!(replace_word_numbers("the twenty-third day"), "the 23 day");
423/// assert_eq!(replace_word_numbers("two thousand twenty-four"), "2024");
424/// assert_eq!(replace_word_numbers("31 December two thousand fourteen"), "31 December 2014");
425/// ```
426pub fn replace_word_numbers(utterance: &str) -> String {
427 let tokens = word_tokens(utterance);
428
429 if tokens.is_empty() {
430 return utterance.to_string();
431 }
432
433 let mut result = String::with_capacity(utterance.len());
434 // Byte offset up to which we have already written into `result`.
435 let mut output_up_to: usize = 0;
436 let mut token_cursor: usize = 0;
437
438 while token_cursor < tokens.len() {
439 match try_parse_number(&tokens, token_cursor) {
440 Some((value, words_consumed)) => {
441 // Write the original utterance bytes that precede this span
442 // (any separators / non-number content between the last output
443 // position and the start of the first consumed word).
444 let span_start = tokens[token_cursor].0;
445 if span_start > output_up_to {
446 result.push_str(&utterance[output_up_to..span_start]);
447 }
448
449 // Write the digit string.
450 result.push_str(&value.to_string());
451
452 // Advance output_up_to past the last consumed word.
453 let last_consumed_index = token_cursor + words_consumed - 1;
454 let (last_word_start, last_word) = tokens[last_consumed_index];
455 output_up_to = last_word_start + last_word.len();
456
457 token_cursor += words_consumed;
458 }
459 None => {
460 // This word is not part of a number span — advance past it.
461 token_cursor += 1;
462 }
463 }
464 }
465
466 // Flush any remaining original bytes after the last replacement.
467 if output_up_to < utterance.len() {
468 result.push_str(&utterance[output_up_to..]);
469 }
470
471 result
472}