partial_date/
word_numbers.rs

1//! Word-number recognition and substitution.
2//!
3//! This module converts English written numbers (e.g. `"twenty-three"`,
4//! `"one thousand nine hundred eighty-four"`) into their digit equivalents,
5//! allowing the tokeniser to treat them the same as numerals.
6//!
7//! # Approach
8//!
9//! The public entry point is [`replace_word_numbers`], which scans an
10//! utterance for the longest contiguous word-number span it can parse and
11//! replaces it with the decimal representation.  Multiple non-overlapping
12//! spans are replaced left-to-right.
13//!
14//! Each individual word is fuzzy-matched against the canonical English
15//! number vocabulary using [`crate::levenshtein::levenshtein_ratio`].  This
16//! lets the module tolerate common typos, repeated characters, transpositions,
17//! and phonetic spelling patterns from non-English speakers (see test suite).
18//!
19//! Ordinal forms (`"first"`, `"twenty-third"`, etc.) are included in the
20//! vocabulary so they parse identically to their cardinal equivalents.
21//!
22//! Common English stop words (`"the"`, `"of"`, `"and"`, etc.) are explicitly
23//! excluded so they cannot produce false-positive number matches.
24//!
25//! # Supported range
26//!
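//! 1 – 3000, covering every value that is meaningful as a day (1–31), month
//! (1–12), or year (1–3000) in the date extraction context.  This is the
//! intended range only: the parser applies no upper bound, so the grammar
//! below accepts anything up to 9999 (`"nine thousand nine hundred
//! ninety-nine"`).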
29//!
30//! # Grammar
31//!
32//! ```text
33//! number   ::= thousands? hundreds? tens_units
34//! thousands ::= unit "thousand"
35//! hundreds  ::= unit "hundred"
36//! tens_units ::= tens unit?   (e.g. "twenty", "twenty-one", "twenty-third")
37//!              | teen          (e.g. "fourteenth")
38//!              | unit          (e.g. "seventh")
39//!              | (empty)
40//! ```
41//!
42//! Hyphenated compound words (`"twenty-one"`) are split on `-` before
43//! individual word matching so the hyphen is treated as a separator.
44
45use crate::levenshtein::levenshtein_ratio;
46
47// ---------------------------------------------------------------------------
48// Fuzzy matching threshold
49// ---------------------------------------------------------------------------
50
/// Lowest [`levenshtein_ratio`] score at which a token is accepted as a
/// spelling of a vocabulary word.
///
/// Scores are compared with `>=`, so a ratio of exactly 0.65 still matches.
///
/// The figures below are the ones recorded when the value was chosen; they
/// are quoted, not recomputed here:
///
/// * rejected — `"six"` vs `"sixty"` (0.60), `"three"` vs `"thirteen"` (0.625)
/// * accepted — `"theer"` vs `"three"` (0.80), `"sevne"` vs `"seven"` (0.86)
///
/// Known limitation: phonetic spellings from non-English speakers (Swahili,
/// Hausa, Zulu, ...) frequently fall below this cut-off; see the ignored
/// `word_numbers_non_english` test module.
const MATCH_THRESHOLD: f32 = 0.65;
// TODO: Raise `MATCH_THRESHOLD`.  The accepted examples quoted above do not
// justify the current value; in particular, "naayiti" does not need to be
// transformed to "eighty".
64
65// ---------------------------------------------------------------------------
66// Stop-word blocklist
67// ---------------------------------------------------------------------------
68
/// Words that are never treated as number words, whatever their fuzzy score.
///
/// Short function words sit only an edit or two away from short number words
/// (`"on"` is a single insertion from `"one"`), so the fuzzy matcher has to be
/// told explicitly to ignore them.  The final row holds date-related nouns
/// that commonly appear right next to a number.
///
/// All entries are lower-case; lookups against this list are exact.
static STOP_WORDS: &[&str] = &[
    // Short function words.
    "the", "of", "on", "or", "in", "at", "to", "a", "an", "and",
    "as", "is", "it", "be", "do", "so", "up", "by", "if", "no",
    "my", "we", "he", "me", "us", "am", "are", "was", "not", "but",
    // Date-related nouns.
    "day", "date", "year", "month", "time", "age",
];
80
81/// Return `true` if `word` is a known stop word and should never be parsed as
82/// a number word.
83fn is_stop_word(word: &str) -> bool {
84    STOP_WORDS.contains(&word)
85}
86
87// ---------------------------------------------------------------------------
88// Vocabulary tables
89// ---------------------------------------------------------------------------
90
/// Vocabulary for the values 1–9 as `(spelling, value)` pairs.
///
/// Cardinals are listed first, then ordinals, so `"one"` and `"first"` both
/// resolve to 1.  Keep the order stable: when two spellings score exactly the
/// same for an input, `best_match` returns the later entry.
#[rustfmt::skip] // three pairs per row keeps cardinals and ordinals aligned
static UNITS: &[(&str, i32)] = &[
    // Cardinals
    ("one", 1), ("two", 2), ("three", 3),
    ("four", 4), ("five", 5), ("six", 6),
    ("seven", 7), ("eight", 8), ("nine", 9),
    // Ordinals
    ("first", 1), ("second", 2), ("third", 3),
    ("fourth", 4), ("fifth", 5), ("sixth", 6),
    ("seventh", 7), ("eighth", 8), ("ninth", 9),
];
117
/// Vocabulary for the values 10–19 as `(spelling, value)` pairs.
///
/// Cardinals are listed first, then ordinals (`"twelve"` and `"twelfth"` both
/// resolve to 12).  Keep the order stable: when two spellings score exactly
/// the same for an input, `best_match` returns the later entry.
#[rustfmt::skip] // compact rows keep cardinals and ordinals aligned
static TEENS: &[(&str, i32)] = &[
    // Cardinals
    ("ten", 10), ("eleven", 11), ("twelve", 12), ("thirteen", 13),
    ("fourteen", 14), ("fifteen", 15), ("sixteen", 16),
    ("seventeen", 17), ("eighteen", 18), ("nineteen", 19),
    // Ordinals
    ("tenth", 10), ("eleventh", 11), ("twelfth", 12), ("thirteenth", 13),
    ("fourteenth", 14), ("fifteenth", 15), ("sixteenth", 16),
    ("seventeenth", 17), ("eighteenth", 18), ("nineteenth", 19),
];
143
/// Vocabulary for the multiples of ten from 20 to 90 as `(spelling, value)`
/// pairs.
///
/// Cardinals are listed first, then ordinals (`"forty"` and `"fortieth"` both
/// resolve to 40).  Keep the order stable: when two spellings score exactly
/// the same for an input, `best_match` returns the later entry.
#[rustfmt::skip] // four pairs per row keeps cardinals and ordinals aligned
static TENS: &[(&str, i32)] = &[
    // Cardinals
    ("twenty", 20), ("thirty", 30), ("forty", 40), ("fifty", 50),
    ("sixty", 60), ("seventy", 70), ("eighty", 80), ("ninety", 90),
    // Ordinals
    ("twentieth", 20), ("thirtieth", 30), ("fortieth", 40), ("fiftieth", 50),
    ("sixtieth", 60), ("seventieth", 70), ("eightieth", 80), ("ninetieth", 90),
];
165
/// Accepted spellings of the ×100 scale word: the cardinal and its ordinal.
static HUNDREDS: &[&str] = &[
    "hundred",
    "hundredth",
];

/// Accepted spellings of the ×1000 scale word: the cardinal and its ordinal.
static THOUSANDS: &[&str] = &[
    "thousand",
    "thousandth",
];
170
171// ---------------------------------------------------------------------------
172// Internal word matching
173// ---------------------------------------------------------------------------
174
175/// Try to fuzzy-match `word` against every entry in `table`.
176///
177/// Returns the value of the best-matching entry if its similarity is at or
178/// above [`MATCH_THRESHOLD`], or `None` if nothing is close enough.
179/// Stop words are rejected before any table lookup.
180fn best_match(word: &str, table: &[(&str, i32)]) -> Option<i32> {
181    if is_stop_word(word) {
182        return None;
183    }
184    table
185        .iter()
186        .map(|&(canonical, value)| (levenshtein_ratio(word, canonical), value))
187        .filter(|&(ratio, _)| ratio >= MATCH_THRESHOLD)
188        .max_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal))
189        .map(|(_, value)| value)
190}
191
192/// Try to match `word` as a unit or its ordinal (1–9).
193fn match_unit(word: &str) -> Option<i32> {
194    best_match(word, UNITS)
195}
196
197/// Try to match `word` as a teen or its ordinal (10–19).
198fn match_teen(word: &str) -> Option<i32> {
199    best_match(word, TEENS)
200}
201
202/// Try to match `word` as a tens value or its ordinal (20–90).
203fn match_tens(word: &str) -> Option<i32> {
204    best_match(word, TENS)
205}
206
/// Which vocabulary table a single number word was resolved against.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum NumberCategory {
    /// A value from the units table (1–9).
    Unit,
    /// A value from the teens table (10–19).
    Teen,
    /// A value from the tens table (20–90).
    Tens,
}
214
215/// Return the highest levenshtein ratio for `word` against any entry in
216/// `table` that meets [`MATCH_THRESHOLD`], or `0.0` if none qualify.
217fn best_ratio(word: &str, table: &[(&str, i32)]) -> f32 {
218    if is_stop_word(word) {
219        return 0.0;
220    }
221    table
222        .iter()
223        .map(|&(canonical, _)| levenshtein_ratio(word, canonical))
224        .filter(|&ratio| ratio >= MATCH_THRESHOLD)
225        .fold(0.0_f32, f32::max)
226}
227
228/// Match `word` against all three categories (units, teens, tens) and return
229/// the value from whichever category has the strictly highest best ratio.
230///
231/// On a tie the preference order is tens > teens > units, so that e.g.
232/// `"twenty"` (exact tens match) is never trumped by a unit ordinal that
233/// happens to tie.
234///
235/// This cross-category comparison prevents a word like `"seven"` from being
236/// misclassified as `70` because `"seventy"` (tens) scores 0.714 and fires
237/// before the exact unit match (1.0) under a fixed-order strategy.
238fn match_best_single_word(word: &str) -> Option<(i32, NumberCategory)> {
239    let unit_ratio = best_ratio(word, UNITS);
240    let teen_ratio = best_ratio(word, TEENS);
241    let tens_ratio = best_ratio(word, TENS);
242
243    if unit_ratio == 0.0 && teen_ratio == 0.0 && tens_ratio == 0.0 {
244        return None;
245    }
246
247    // Strictly highest ratio wins.  Tie-break: tens > teens > units.
248    if unit_ratio > teen_ratio && unit_ratio > tens_ratio {
249        match_unit(word).map(|value| (value, NumberCategory::Unit))
250    } else if teen_ratio > tens_ratio {
251        match_teen(word).map(|value| (value, NumberCategory::Teen))
252    } else {
253        match_tens(word).map(|value| (value, NumberCategory::Tens))
254    }
255}
256
257/// Try to match `word` as "hundred" (or "hundredth").
258fn match_hundred(word: &str) -> bool {
259    if is_stop_word(word) {
260        return false;
261    }
262    HUNDREDS
263        .iter()
264        .any(|&canonical| levenshtein_ratio(word, canonical) >= MATCH_THRESHOLD)
265}
266
267/// Try to match `word` as "thousand" (or "thousandth").
268fn match_thousand(word: &str) -> bool {
269    if is_stop_word(word) {
270        return false;
271    }
272    THOUSANDS
273        .iter()
274        .any(|&canonical| levenshtein_ratio(word, canonical) >= MATCH_THRESHOLD)
275}
276
277// ---------------------------------------------------------------------------
278// Tokenisation of an utterance into words
279// ---------------------------------------------------------------------------
280
/// Split an utterance into word tokens.
///
/// Separators are any Unicode whitespace (`char::is_whitespace`, which
/// covers the ASCII space, tab, CR and LF as well as no-break space, form
/// feed, etc.) and hyphen-like characters: ASCII `-`, U+2010 HYPHEN,
/// U+2011 NON-BREAKING HYPHEN, U+2013 EN DASH and U+2014 EM DASH.  Runs of
/// separators produce no empty tokens.
///
/// Each token is returned as `(byte_offset, slice)`, where `byte_offset` is
/// the position of the token's first byte in `utterance`, so callers can
/// splice replacements back into the original string.
fn word_tokens(utterance: &str) -> Vec<(usize, &str)> {
    let is_separator = |character: char| {
        character.is_whitespace()
            || matches!(
                character,
                '-' | '\u{2010}' | '\u{2011}' | '\u{2013}' | '\u{2014}'
            )
    };

    let mut tokens: Vec<(usize, &str)> = Vec::new();
    // Byte offset at which the word currently being scanned began, if any.
    let mut word_start: Option<usize> = None;

    for (byte_offset, character) in utterance.char_indices() {
        if is_separator(character) {
            // `byte_offset` is the first byte of the separator, so it is also
            // the exclusive end of the word, however wide the separator is.
            if let Some(start) = word_start.take() {
                tokens.push((start, &utterance[start..byte_offset]));
            }
        } else if word_start.is_none() {
            word_start = Some(byte_offset);
        }
    }

    // A word that runs to the end of the input has no closing separator.
    if let Some(start) = word_start {
        tokens.push((start, &utterance[start..]));
    }

    tokens
}
310
311// ---------------------------------------------------------------------------
312// Greedy number span parser
313// ---------------------------------------------------------------------------
314
315/// Attempt to parse a number starting at `tokens[cursor]`.
316///
317/// Returns `(value, words_consumed)` if a number was parsed, or `None`.
318///
319/// Grammar (greedy, left-to-right):
320/// ```text
321/// number ::= thousands? hundreds? tens_units
322/// thousands ::= unit "thousand"
323/// hundreds  ::= unit "hundred"
324/// tens_units ::= (tens unit?) | teen | unit | ε
325/// ```
326fn try_parse_number(tokens: &[(usize, &str)], cursor: usize) -> Option<(i32, usize)> {
327    let lower_word = |index: usize| -> Option<String> {
328        tokens.get(index).map(|(_, word)| word.to_ascii_lowercase())
329    };
330
331    let mut position = cursor;
332    let mut total: i32 = 0;
333
334    // --- Thousands component ------------------------------------------------
335    // Pattern: <unit> "thousand"
336    if let Some(unit_word) = lower_word(position)
337        && let Some(unit_value) = match_unit(&unit_word)
338        && let Some(thousand_word) = lower_word(position + 1)
339        && match_thousand(&thousand_word)
340    {
341        total += unit_value * 1000;
342        position += 2;
343    }
344
345    // --- Hundreds component -------------------------------------------------
346    // Pattern: <unit> "hundred"
347    if let Some(unit_word) = lower_word(position)
348        && let Some(unit_value) = match_unit(&unit_word)
349        && let Some(hundred_word) = lower_word(position + 1)
350        && match_hundred(&hundred_word)
351    {
352        total += unit_value * 100;
353        position += 2;
354    }
355
356    // --- Tens-and-units component -------------------------------------------
357    //
358    // Use cross-category best-ratio selection so that a word like "seven" is
359    // never misclassified as 70 just because "seventy" (tens) clears the
360    // threshold before the exact unit match is checked.
361    if let Some(word) = lower_word(position) {
362        match match_best_single_word(&word) {
363            Some((value, NumberCategory::Tens)) => {
364                total += value;
365                position += 1;
366                // A tens word may optionally be followed by a unit
367                // (e.g. "twenty" + "one" → 21).
368                if let Some(unit_word) = lower_word(position)
369                    && let Some((unit_value, _)) = match_best_single_word(&unit_word)
370                    // Only accept a Unit or Teen here, not another Tens.
371                    && matches!(
372                        match_best_single_word(&unit_word),
373                        Some((_, NumberCategory::Unit | NumberCategory::Teen))
374                    )
375                {
376                    total += unit_value;
377                    position += 1;
378                }
379            }
380            Some((value, NumberCategory::Teen | NumberCategory::Unit)) => {
381                total += value;
382                position += 1;
383            }
384            None => {
385                // No match at this position — fine if thousands/hundreds
386                // already accumulated something.
387            }
388        }
389    }
390
391    let words_consumed = position - cursor;
392
393    // Require at least one word consumed and a positive total.
394    if words_consumed == 0 || total <= 0 {
395        return None;
396    }
397
398    Some((total, words_consumed))
399}
400
401// ---------------------------------------------------------------------------
402// Public API
403// ---------------------------------------------------------------------------
404
405/// Scan `utterance` for word-number spans and replace each with its decimal
406/// representation, returning the modified string.
407///
408/// Spans are found greedily left-to-right.  The longest parseable span
409/// starting at each position is consumed; overlapping spans are not
410/// considered.  Words that do not participate in a recognised number span
411/// are left in place unchanged (including month names, noise words, etc.).
412///
413/// Ordinal forms (`"first"`, `"twenty-third"`, etc.) are treated identically
414/// to their cardinal equivalents (`"one"`, `"twenty-three"`).
415///
416/// # Examples
417///
418/// ```
419/// use partial_date::word_numbers::replace_word_numbers;
420///
421/// assert_eq!(replace_word_numbers("twenty-three"), "23");
422/// assert_eq!(replace_word_numbers("the twenty-third day"), "the 23 day");
423/// assert_eq!(replace_word_numbers("two thousand twenty-four"), "2024");
424/// assert_eq!(replace_word_numbers("31 December two thousand fourteen"), "31 December 2014");
425/// ```
426pub fn replace_word_numbers(utterance: &str) -> String {
427    let tokens = word_tokens(utterance);
428
429    if tokens.is_empty() {
430        return utterance.to_string();
431    }
432
433    let mut result = String::with_capacity(utterance.len());
434    // Byte offset up to which we have already written into `result`.
435    let mut output_up_to: usize = 0;
436    let mut token_cursor: usize = 0;
437
438    while token_cursor < tokens.len() {
439        match try_parse_number(&tokens, token_cursor) {
440            Some((value, words_consumed)) => {
441                // Write the original utterance bytes that precede this span
442                // (any separators / non-number content between the last output
443                // position and the start of the first consumed word).
444                let span_start = tokens[token_cursor].0;
445                if span_start > output_up_to {
446                    result.push_str(&utterance[output_up_to..span_start]);
447                }
448
449                // Write the digit string.
450                result.push_str(&value.to_string());
451
452                // Advance output_up_to past the last consumed word.
453                let last_consumed_index = token_cursor + words_consumed - 1;
454                let (last_word_start, last_word) = tokens[last_consumed_index];
455                output_up_to = last_word_start + last_word.len();
456
457                token_cursor += words_consumed;
458            }
459            None => {
460                // This word is not part of a number span — advance past it.
461                token_cursor += 1;
462            }
463        }
464    }
465
466    // Flush any remaining original bytes after the last replacement.
467    if output_up_to < utterance.len() {
468        result.push_str(&utterance[output_up_to..]);
469    }
470
471    result
472}
partial_date/word_numbers.rs

partial_date/
word_numbers.rs