partial-date 0.1.1

Deterministic partial date extraction from natural language text, with sensible defaults and extensive configurability.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
//! Word-number recognition and substitution.
//!
//! This module converts English written numbers (e.g. `"twenty-three"`,
//! `"one thousand nine hundred eighty-four"`) into their digit equivalents,
//! allowing the tokeniser to treat them the same as numerals.
//!
//! # Approach
//!
//! The public entry point is [`replace_word_numbers`], which scans an
//! utterance for the longest contiguous word-number span it can parse and
//! replaces it with the decimal representation.  Multiple non-overlapping
//! spans are replaced left-to-right.
//!
//! Each individual word is fuzzy-matched against the canonical English
//! number vocabulary using [`crate::levenshtein::levenshtein_ratio`].  This
//! lets the module tolerate common typos, repeated characters, transpositions,
//! and phonetic spelling patterns from non-English speakers (see test suite).
//!
//! Ordinal forms (`"first"`, `"twenty-third"`, etc.) are included in the
//! vocabulary so they parse identically to their cardinal equivalents.
//!
//! Common English stop words (`"the"`, `"of"`, `"and"`, etc.) are explicitly
//! excluded so they cannot produce false-positive number matches.
//!
//! # Supported range
//!
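//! 1 – 3000, covering every value that is meaningful as a day (1–31), month
//! (1–12), or year (1–3000) in the date extraction context.  This is the
//! intended range rather than an enforced limit: the parser applies no upper
//! bound, so the grammar below accepts any value it can spell, up to 9999
//! (`"nine thousand nine hundred ninety-nine"`).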
//!
//! # Grammar
//!
//! ```text
//! number   ::= thousands? hundreds? tens_units
//! thousands ::= unit "thousand"
//! hundreds  ::= unit "hundred"
//! tens_units ::= tens unit?   (e.g. "twenty", "twenty-one", "twenty-third")
//!              | teen          (e.g. "fourteenth")
//!              | unit          (e.g. "seventh")
//!              | (empty)
//! ```
//!
//! Hyphenated compound words (`"twenty-one"`) are split on `-` before
//! individual word matching so the hyphen is treated as a separator.

use crate::levenshtein::levenshtein_ratio;

// ---------------------------------------------------------------------------
// Fuzzy matching threshold
// ---------------------------------------------------------------------------

/// Minimum similarity ratio for a word to be accepted as a number word.
///
/// The comparison is inclusive: a word is accepted when its ratio is
/// greater than or equal to this value.
///
/// 0.65 is high enough to prevent cross-category false positives such as
/// `"six"` → `"sixty"` (ratio 0.60) and `"three"` → `"thirteen"` (ratio
/// 0.625), while still accepting the common English misspellings and typos
/// exercised by this library's tests (e.g. `"theer"` → `"three"` at 0.80,
/// `"sevne"` → `"seven"` at 0.86).
///
/// Non-English phonetic patterns (Swahili, Hausa, Zulu, etc.) often score
/// below this threshold and are a known limitation — see the ignored
/// `word_numbers_non_english` test module.
const MATCH_THRESHOLD: f32 = 0.65;
// TODO: Increase MATCH_THRESHOLD. The tolerance examples quoted in the doc
// comment above do not justify the current value; in particular, a phonetic
// spelling such as "naayiti" does not need to be transformed into "eighty".

// ---------------------------------------------------------------------------
// Stop-word blocklist
// ---------------------------------------------------------------------------

/// Common English words that must never be interpreted as number words,
/// regardless of their fuzzy similarity to number vocabulary.
///
/// Without this list, short words like `"the"` (ratio 0.60 against `"three"`),
/// `"on"` (ratio 0.67 against `"one"`), and `"or"` (ratio 0.67 against
/// `"four"`) would produce false-positive number matches.
///
/// All entries are lowercase and are compared by exact string equality, so
/// a word must already be lowercased for the lookup to hit.
static STOP_WORDS: &[&str] = &[
    // Function words: articles, prepositions, conjunctions, pronouns, auxiliaries.
    "the", "of", "on", "or", "in", "at", "to", "a", "an", "and", "as", "is", "it", "be", "do",
    "so", "up", "by", "if", "no", "my", "we", "he", "me", "us", "am", "are", "was", "not", "but",
    // Date/time-domain nouns, blocked in the same way as the function words.
    "day", "date", "year", "month", "time", "age",
];

/// Report whether `word` appears verbatim in [`STOP_WORDS`].
///
/// The comparison is an exact, case-sensitive string equality test; a word
/// for which this returns `true` must never be treated as a number word.
fn is_stop_word(word: &str) -> bool {
    STOP_WORDS.iter().any(|&blocked| blocked == word)
}

// ---------------------------------------------------------------------------
// Vocabulary tables
// ---------------------------------------------------------------------------

/// Cardinal and ordinal spellings for the units 1–9.
///
/// Each entry is `(canonical_spelling, value)`.  Multiple entries with the
/// same value allow both `"one"` and `"first"` to resolve to 1.
///
/// Entry order only matters when two spellings score exactly the same
/// ratio for a word: `best_match` picks its winner with `Iterator::max_by`,
/// which keeps the later of several equal maxima.
static UNITS: &[(&str, i32)] = &[
    // Cardinals
    ("one", 1),
    ("two", 2),
    ("three", 3),
    ("four", 4),
    ("five", 5),
    ("six", 6),
    ("seven", 7),
    ("eight", 8),
    ("nine", 9),
    // Ordinals
    ("first", 1),
    ("second", 2),
    ("third", 3),
    ("fourth", 4),
    ("fifth", 5),
    ("sixth", 6),
    ("seventh", 7),
    ("eighth", 8),
    ("ninth", 9),
];

/// Cardinal and ordinal spellings for the teens 10–19.
///
/// Each entry is `(canonical_spelling, value)`; the cardinal and the ordinal
/// form of a number share the same value (e.g. `"ten"` and `"tenth"` are
/// both 10).
static TEENS: &[(&str, i32)] = &[
    // Cardinals
    ("ten", 10),
    ("eleven", 11),
    ("twelve", 12),
    ("thirteen", 13),
    ("fourteen", 14),
    ("fifteen", 15),
    ("sixteen", 16),
    ("seventeen", 17),
    ("eighteen", 18),
    ("nineteen", 19),
    // Ordinals
    ("tenth", 10),
    ("eleventh", 11),
    ("twelfth", 12),
    ("thirteenth", 13),
    ("fourteenth", 14),
    ("fifteenth", 15),
    ("sixteenth", 16),
    ("seventeenth", 17),
    ("eighteenth", 18),
    ("nineteenth", 19),
];

/// Cardinal and ordinal spellings for the tens 20–90.
///
/// Each entry is `(canonical_spelling, value)`; the cardinal and the ordinal
/// form of a number share the same value (e.g. `"twenty"` and `"twentieth"`
/// are both 20).
static TENS: &[(&str, i32)] = &[
    // Cardinals
    ("twenty", 20),
    ("thirty", 30),
    ("forty", 40),
    ("fifty", 50),
    ("sixty", 60),
    ("seventy", 70),
    ("eighty", 80),
    ("ninety", 90),
    // Ordinals
    ("twentieth", 20),
    ("thirtieth", 30),
    ("fortieth", 40),
    ("fiftieth", 50),
    ("sixtieth", 60),
    ("seventieth", 70),
    ("eightieth", 80),
    ("ninetieth", 90),
];

/// The word "hundred" (and ordinal "hundredth").
///
/// Unlike the unit/teen/tens tables this list carries no numeric values:
/// it is only used to recognise the scale word, and the parser applies the
/// ×100 multiplier itself.
static HUNDREDS: &[&str] = &["hundred", "hundredth"];
/// The word "thousand" (and ordinal "thousandth").
///
/// Used only to recognise the scale word; the parser applies the ×1000
/// multiplier itself.
static THOUSANDS: &[&str] = &["thousand", "thousandth"];

// ---------------------------------------------------------------------------
// Internal word matching
// ---------------------------------------------------------------------------

/// Look up the closest entry for `word` in a `(spelling, value)` table.
///
/// Stop words short-circuit to `None` before any comparison is made.
/// Otherwise every spelling is scored with [`levenshtein_ratio`]; scores
/// below [`MATCH_THRESHOLD`] are discarded and the value belonging to the
/// highest remaining score is returned.  When several entries share the top
/// score, the one that appears last in `table` wins.  `None` means no entry
/// reached the threshold.
fn best_match(word: &str, table: &[(&str, i32)]) -> Option<i32> {
    if is_stop_word(word) {
        return None;
    }

    let mut leader: Option<(f32, i32)> = None;
    for &(canonical, value) in table {
        let score = levenshtein_ratio(word, canonical);
        if score >= MATCH_THRESHOLD {
            // The incumbent survives only when it is strictly better, so an
            // equal score replaces it and later entries win ties.
            let incumbent_is_better = matches!(leader, Some((best, _)) if best > score);
            if !incumbent_is_better {
                leader = Some((score, value));
            }
        }
    }

    leader.map(|(_, value)| value)
}

/// Fuzzy-match `word` against the [`UNITS`] table (cardinals and ordinals
/// for 1–9), returning the matched value or `None` when nothing qualifies.
fn match_unit(word: &str) -> Option<i32> {
    let table: &[(&str, i32)] = UNITS;
    best_match(word, table)
}

/// Fuzzy-match `word` against the [`TEENS`] table (cardinals and ordinals
/// for 10–19), returning the matched value or `None` when nothing qualifies.
fn match_teen(word: &str) -> Option<i32> {
    let table: &[(&str, i32)] = TEENS;
    best_match(word, table)
}

/// Fuzzy-match `word` against the [`TENS`] table (cardinals and ordinals for
/// 20–90), returning the matched value or `None` when nothing qualifies.
fn match_tens(word: &str) -> Option<i32> {
    let table: &[(&str, i32)] = TENS;
    best_match(word, table)
}

/// The number category that a word was matched into.
///
/// Produced by `match_best_single_word` alongside the matched value so the
/// parser can decide what may follow the word (only a tens word can be
/// extended by a following word).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum NumberCategory {
    /// The word matched the units table (values 1–9, cardinal or ordinal).
    Unit,
    /// The word matched the teens table (values 10–19, cardinal or ordinal).
    Teen,
    /// The word matched the tens table (values 20–90, cardinal or ordinal).
    Tens,
}

/// Score `word` against every spelling in `table` and report the best
/// qualifying similarity.
///
/// Only ratios at or above [`MATCH_THRESHOLD`] count.  The result is `0.0`
/// when `word` is a stop word or when no spelling reaches the threshold, so
/// `0.0` doubles as the "no match" marker for callers.
fn best_ratio(word: &str, table: &[(&str, i32)]) -> f32 {
    if is_stop_word(word) {
        return 0.0;
    }

    let mut highest = 0.0_f32;
    for &(canonical, _) in table {
        let score = levenshtein_ratio(word, canonical);
        if score >= MATCH_THRESHOLD && score > highest {
            highest = score;
        }
    }
    highest
}

/// Classify a single word as a unit, teen, or tens word and return its
/// value together with the winning [`NumberCategory`].
///
/// The best qualifying ratio is computed per category with [`best_ratio`].
/// If no category qualifies (all three ratios are `0.0`) the result is
/// `None`.  Otherwise:
///
/// * units win only when their ratio is strictly greater than both others;
/// * failing that, teens win when their ratio is strictly greater than tens;
/// * otherwise tens win.
///
/// Ties therefore resolve in the order tens > teens > units.  Comparing
/// across categories is what keeps an exact unit such as `"seven"` from
/// being read as 70 merely because `"seventy"` also clears the threshold.
fn match_best_single_word(word: &str) -> Option<(i32, NumberCategory)> {
    let unit_score = best_ratio(word, UNITS);
    let teen_score = best_ratio(word, TEENS);
    let tens_score = best_ratio(word, TENS);

    let nothing_qualifies = [unit_score, teen_score, tens_score]
        .iter()
        .all(|&score| score == 0.0);
    if nothing_qualifies {
        return None;
    }

    // Pick the winning category first, then look the value up in its table.
    let category = if unit_score > teen_score && unit_score > tens_score {
        NumberCategory::Unit
    } else if teen_score > tens_score {
        NumberCategory::Teen
    } else {
        NumberCategory::Tens
    };

    let value = match category {
        NumberCategory::Unit => match_unit(word),
        NumberCategory::Teen => match_teen(word),
        NumberCategory::Tens => match_tens(word),
    }?;

    Some((value, category))
}

/// Report whether `word` fuzzy-matches any spelling in [`HUNDREDS`]
/// (`"hundred"` / `"hundredth"`) at or above [`MATCH_THRESHOLD`].
///
/// Stop words never match.
fn match_hundred(word: &str) -> bool {
    !is_stop_word(word)
        && HUNDREDS
            .iter()
            .copied()
            .any(|spelling| levenshtein_ratio(word, spelling) >= MATCH_THRESHOLD)
}

/// Report whether `word` fuzzy-matches any spelling in [`THOUSANDS`]
/// (`"thousand"` / `"thousandth"`) at or above [`MATCH_THRESHOLD`].
///
/// Stop words never match.
fn match_thousand(word: &str) -> bool {
    !is_stop_word(word)
        && THOUSANDS
            .iter()
            .copied()
            .any(|spelling| levenshtein_ratio(word, spelling) >= MATCH_THRESHOLD)
}

// ---------------------------------------------------------------------------
// Tokenisation of an utterance into words
// ---------------------------------------------------------------------------

/// Split an utterance into a sequence of word tokens.
///
/// Hyphens and every Unicode whitespace character (as defined by
/// [`char::is_whitespace`], which includes space, tab, CR, LF, form feed,
/// and non-breaking space) act as separators.  Runs of separators produce
/// no empty tokens.
///
/// Each token is returned as `(byte_offset, word)`, where `byte_offset` is
/// the position of the word's first byte in `utterance`, so callers can
/// splice replacements back into the original string.  Offsets always fall
/// on `char` boundaries because they come from `char_indices`.
fn word_tokens(utterance: &str) -> Vec<(usize, &str)> {
    let is_separator = |character: char| character == '-' || character.is_whitespace();

    let mut tokens: Vec<(usize, &str)> = Vec::new();
    // Byte offset at which the word currently being scanned began, if any.
    let mut word_start: Option<usize> = None;

    for (byte_offset, character) in utterance.char_indices() {
        match (is_separator(character), word_start) {
            // A separator closes the word in progress.
            (true, Some(start)) => {
                tokens.push((start, &utterance[start..byte_offset]));
                word_start = None;
            }
            // First character of a new word.
            (false, None) => word_start = Some(byte_offset),
            // Either still inside a word, or inside a run of separators.
            _ => {}
        }
    }

    // Flush a word that runs to the end of the utterance.
    if let Some(start) = word_start {
        tokens.push((start, &utterance[start..]));
    }

    tokens
}

// ---------------------------------------------------------------------------
// Greedy number span parser
// ---------------------------------------------------------------------------

/// Attempt to parse a number starting at `tokens[cursor]`.
///
/// Returns `(value, words_consumed)` if a number was parsed, or `None`.
///
/// Grammar (greedy, left-to-right):
/// ```text
/// number ::= thousands? hundreds? tens_units
/// thousands ::= unit "thousand"
/// hundreds  ::= unit "hundred"
/// tens_units ::= (tens unit?) | teen | unit | ε
/// ```
fn try_parse_number(tokens: &[(usize, &str)], cursor: usize) -> Option<(i32, usize)> {
    let lower_word = |index: usize| -> Option<String> {
        tokens.get(index).map(|(_, word)| word.to_ascii_lowercase())
    };

    let mut position = cursor;
    let mut total: i32 = 0;

    // --- Thousands component ------------------------------------------------
    // Pattern: <unit> "thousand"
    if let Some(unit_word) = lower_word(position)
        && let Some(unit_value) = match_unit(&unit_word)
        && let Some(thousand_word) = lower_word(position + 1)
        && match_thousand(&thousand_word)
    {
        total += unit_value * 1000;
        position += 2;
    }

    // --- Hundreds component -------------------------------------------------
    // Pattern: <unit> "hundred"
    if let Some(unit_word) = lower_word(position)
        && let Some(unit_value) = match_unit(&unit_word)
        && let Some(hundred_word) = lower_word(position + 1)
        && match_hundred(&hundred_word)
    {
        total += unit_value * 100;
        position += 2;
    }

    // --- Tens-and-units component -------------------------------------------
    //
    // Use cross-category best-ratio selection so that a word like "seven" is
    // never misclassified as 70 just because "seventy" (tens) clears the
    // threshold before the exact unit match is checked.
    if let Some(word) = lower_word(position) {
        match match_best_single_word(&word) {
            Some((value, NumberCategory::Tens)) => {
                total += value;
                position += 1;
                // A tens word may optionally be followed by a unit
                // (e.g. "twenty" + "one" → 21).
                if let Some(unit_word) = lower_word(position)
                    && let Some((unit_value, _)) = match_best_single_word(&unit_word)
                    // Only accept a Unit or Teen here, not another Tens.
                    && matches!(
                        match_best_single_word(&unit_word),
                        Some((_, NumberCategory::Unit | NumberCategory::Teen))
                    )
                {
                    total += unit_value;
                    position += 1;
                }
            }
            Some((value, NumberCategory::Teen | NumberCategory::Unit)) => {
                total += value;
                position += 1;
            }
            None => {
                // No match at this position — fine if thousands/hundreds
                // already accumulated something.
            }
        }
    }

    let words_consumed = position - cursor;

    // Require at least one word consumed and a positive total.
    if words_consumed == 0 || total <= 0 {
        return None;
    }

    Some((total, words_consumed))
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/// Rewrite every spelled-out number in `utterance` as decimal digits and
/// return the resulting string.
///
/// The utterance is tokenised into words and walked left-to-right.  At each
/// word the parser is asked for the longest number span starting there; a
/// successful span is replaced by its value and the walk resumes after it,
/// so spans never overlap.  Everything that is not part of a recognised
/// span — other words, separators, month names, noise — is copied through
/// byte-for-byte.  An utterance with no word tokens is returned unchanged.
///
/// Ordinal forms (`"first"`, `"twenty-third"`, etc.) are treated identically
/// to their cardinal equivalents (`"one"`, `"twenty-three"`).
///
/// # Examples
///
/// ```
/// use partial_date::word_numbers::replace_word_numbers;
///
/// assert_eq!(replace_word_numbers("twenty-three"), "23");
/// assert_eq!(replace_word_numbers("the twenty-third day"), "the 23 day");
/// assert_eq!(replace_word_numbers("two thousand twenty-four"), "2024");
/// assert_eq!(replace_word_numbers("31 December two thousand fourteen"), "31 December 2014");
/// ```
pub fn replace_word_numbers(utterance: &str) -> String {
    let words = word_tokens(utterance);
    if words.is_empty() {
        return utterance.to_owned();
    }

    let mut rewritten = String::with_capacity(utterance.len());
    // Everything in `utterance[..copied_until]` has already been emitted
    // (either verbatim or as a digit replacement).
    let mut copied_until = 0_usize;
    let mut index = 0_usize;

    while index < words.len() {
        let Some((number, span_len)) = try_parse_number(&words, index) else {
            // Not the start of a number span; its text is copied later as
            // part of the gap before the next span or in the final flush.
            index += 1;
            continue;
        };

        // Copy the untouched text between the previous output position and
        // the first word of this span.
        let (span_start, _) = words[index];
        if copied_until < span_start {
            rewritten.push_str(&utterance[copied_until..span_start]);
        }

        // Emit the digits in place of the span.
        rewritten.push_str(&number.to_string());

        // Skip the consumed words and resume copying right after the last one.
        index += span_len;
        let (final_start, final_word) = words[index - 1];
        copied_until = final_start + final_word.len();
    }

    // Copy whatever follows the last replacement.
    if copied_until < utterance.len() {
        rewritten.push_str(&utterance[copied_until..]);
    }

    rewritten
}