partial_date/
tokeniser.rs

1//! Tokenisation: splitting and classifying utterance chunks into date tokens.
2
3use crate::models::{Config, DateComponent, MonthName, Token};
4use crate::word_numbers::replace_word_numbers;
5
6/// Split `utterance` on any standard separator or extra separator and classify
7/// each resulting chunk as a [`Token`].
8///
9/// # What counts as a separator
10///
11/// The standard separator set is: ASCII whitespace (space, tab, newline,
12/// carriage return), `/`, `-`, `.`, `,`, `\`. Any additional strings in
13/// `config.extra_separators` are also treated as separators.
14///
15/// When `config.no_separator` is `true` and the utterance is a pure digit
16/// string of length 6 or 8, it is sliced positionally according to
17/// `config.component_order` rather than split on separators.
18///
19/// # Classification
20///
21/// Each non-separator chunk is examined for digit-to-alpha (or alpha-to-digit)
22/// boundaries, allowing adjacent tokens like `"19october"` or `"August7"` to
23/// be split and classified independently.
24///
25/// - [`Token::OrdinalDay`] — digit run followed by `st`, `nd`, `rd`, or `th`.
26/// - [`Token::MonthName`] — full name, 3-letter abbreviation, unambiguous
27///   prefix, or fuzzy misspelling.
28/// - [`Token::Numeric`] — a run of ASCII digits; stores `(value, digit_count)`.
29/// - Anything else (noise words, stray punctuation) is silently discarded.
30///
31/// When [`Config::letter_o_substitution`] is `true` (the default), any token
32/// whose characters are all ASCII digits or the letter `O` (upper or lower
33/// case) is treated as a numeric token with every `O`/`o` replaced by `0`.
34/// This handles OCR and typing errors such as `"2O24"` → `2024`.  The
35/// substitution applies only to isolated tokens; a letter O that is part of a
36/// longer alphabetic run (e.g. `"october"`) is never affected because
37/// `sub_split_on_boundary` has already separated digit and alpha runs.
38///
39/// At most **three** tokens are returned.
40///
41/// # Examples
42///
43/// ```
44/// use partial_date::extract::tokenise;
45/// use partial_date::models::{Config, MonthName, Token};
46///
47/// assert_eq!(
48///     tokenise("19 October 2014", &Config::default()),
49///     vec![
50///         Token::Numeric(19, 2),
51///         Token::MonthName(MonthName::October),
52///         Token::Numeric(2014, 4),
53///     ]
54/// );
55///
56/// assert_eq!(
57///     tokenise("19th October,2015", &Config::default()),
58///     vec![
59///         Token::OrdinalDay(19),
60///         Token::MonthName(MonthName::October),
61///         Token::Numeric(2015, 4),
62///     ]
63/// );
64///
65/// assert_eq!(
66///     tokenise("19october", &Config::default()),
67///     vec![
68///         Token::Numeric(19, 2),
69///         Token::MonthName(MonthName::October),
70///     ]
71/// );
72///
73/// // Letter O substitution (enabled by default):
74/// assert_eq!(
75///     tokenise("2O24", &Config::default()),
76///     vec![Token::Numeric(2024, 4)]
77/// );
78///
79/// // "7october" — the O is part of "october", not a standalone token, so
80/// // substitution does not apply and the month name is recognised normally.
81/// assert_eq!(
82///     tokenise("7october", &Config::default()),
83///     vec![
84///         Token::Numeric(7, 1),
85///         Token::MonthName(MonthName::October),
86///     ]
87/// );
88/// ```
89pub fn tokenise(utterance: &str, config: &Config) -> Vec<Token> {
90    // Replace any word-number spans (e.g. "twenty-three") with their digit
91    // equivalents before any further processing.  This is done unconditionally
92    // so that "nineteen eighty-four" becomes "1984" and is then classified as a
93    // normal Numeric token.  The replacement is non-destructive for utterances
94    // that contain no word numbers.
95    let normalised = replace_word_numbers(utterance);
96    let utterance = normalised.as_str();
97
98    // No-separator path: pure-digit string of length 6 (DDMMYY) or 8 (DDMMYYYY).
99    if config.no_separator
100        && let Some(tokens) = try_tokenise_no_separator(utterance, &config.component_order)
101    {
102        return tokens;
103    }
104
105    // Standard separator path.
106    const STANDARD_SEPS: &[char] = &[' ', '\t', '\n', '\r', '/', '-', '.', ',', '\\'];
107
108    let mut separator_chars: Vec<char> = STANDARD_SEPS.to_vec();
109    let mut multi_char_separators: Vec<&str> = Vec::new();
110
111    for s in &config.extra_separators {
112        let mut chars = s.chars();
113        if let Some(first) = chars.next() {
114            if chars.next().is_none() {
115                separator_chars.push(first);
116            } else {
117                multi_char_separators.push(s.as_str());
118            }
119        }
120    }
121
122    let separator_ranges =
123        find_separator_ranges(utterance, &separator_chars, &multi_char_separators);
124    let raw_chunks = spans_between_separators(utterance, &separator_ranges);
125
126    // A date has at most three components, so we never need more than three
127    // tokens. Pre-allocating exactly 3 avoids any reallocation.
128    let mut tokens: Vec<Token> = Vec::with_capacity(3);
129
130    // TODO: Split this into it's own fn and add unit tests for it. It's doing a lot of work and has some non-trivial logic that deserves its own tests.
131    // Label the outer loop so the inner loop can break out of both at once
132    // when the token limit is reached (see the `break 'outer` below).
133    'outer: for chunk in raw_chunks {
134        // Skip chunks that contain no alphanumeric characters at all — e.g. a
135        // stray "!" or "--" that survived the separator pass. There is nothing
136        // here that could become a token.
137        if !chunk.chars().any(|c| c.is_alphanumeric()) {
138            continue;
139        }
140
141        // Letter-O substitution at the chunk level: when the entire chunk
142        // consists solely of ASCII digits and the letter O (upper or lower
143        // case), replace every O/o with '0' before boundary splitting.
144        //
145        // This is intentionally a whole-chunk check: "2O24" → "2024" (all
146        // chars are digit-or-O), but "7october" is left untouched because
147        // "october" contains characters other than O, so the chunk as a whole
148        // does not satisfy the all-digit-or-O predicate.
149        //
150        // Performing the substitution here, before sub_split_on_boundary, is
151        // essential: if we waited until after splitting, "2O24" would be
152        // fragmented into ["2", "O", "24"] at the digit↔alpha boundaries,
153        // producing three separate tokens instead of the single Numeric(2024).
154        let substituted_chunk: String;
155        let effective_chunk = if config.letter_o_substitution
156            && chunk
157                .chars()
158                .all(|c| c.is_ascii_digit() || c == 'o' || c == 'O')
159            && chunk.chars().any(|c| c == 'o' || c == 'O')
160        {
161            substituted_chunk = chunk
162                .chars()
163                .map(|c| if c == 'o' || c == 'O' { '0' } else { c })
164                .collect();
165            substituted_chunk.as_str()
166        } else {
167            chunk
168        };
169
170        // A chunk may contain a digit-to-alpha or alpha-to-digit boundary with
171        // no separator — e.g. "19october" or "August7". sub_split_on_boundary
172        // splits at those transitions so each run can be classified on its own.
173        // For a plain chunk like "2014" this produces a single-element vec, so
174        // the inner loop runs exactly once.
175        //
176        // Note: ordinal suffixes ("19th", "3rd") are intentionally NOT split —
177        // the boundary detector leaves them intact so classify() can recognise
178        // the whole thing as Token::OrdinalDay.
179        for sub in sub_split_on_boundary(effective_chunk) {
180            // Stop as soon as we have day, month, and year — there is nothing
181            // useful left to collect. `break 'outer` exits both loops at once;
182            // a plain `break` would only exit this inner loop and the outer
183            // loop would continue consuming chunks needlessly.
184            if tokens.len() == 3 {
185                break 'outer;
186            }
187
188            // classify() tries to turn the sub-slice into a Token::OrdinalDay,
189            // Token::Numeric, or Token::MonthName. Noise words ("the", "of")
190            // and unrecognised strings return None and are silently dropped —
191            // no error, no placeholder.
192            if let Some(token) = classify(sub) {
193                tokens.push(token);
194            }
195        }
196    }
197
198    tokens
199}
200
201/// Attempt to tokenise a no-separator pure-digit string by positional slicing.
202///
203/// Handles lengths 6 (two-digit year) and 8 (four-digit year). Returns `None`
204/// if the string is not purely digits or not one of the expected lengths.
205fn try_tokenise_no_separator(
206    utterance: &str,
207    order: &crate::models::ComponentOrder,
208) -> Option<Vec<Token>> {
209    let bytes = utterance.as_bytes();
210    if !bytes.iter().all(|b| b.is_ascii_digit()) {
211        return None;
212    }
213
214    // Determine slice widths: year slot gets 4 digits (8-char) or 2 (6-char).
215    let (year_width, total) = match bytes.len() {
216        8 => (4usize, 8usize),
217        6 => (2usize, 6usize),
218        _ => return None,
219    };
220
221    // Build (component, width) pairs in order.
222    let widths = [
223        (
224            order.first,
225            if order.first == DateComponent::Year {
226                year_width
227            } else {
228                2
229            },
230        ),
231        (
232            order.second,
233            if order.second == DateComponent::Year {
234                year_width
235            } else {
236                2
237            },
238        ),
239        (
240            order.third,
241            if order.third == DateComponent::Year {
242                year_width
243            } else {
244                2
245            },
246        ),
247    ];
248
249    // Verify widths sum to the total length.
250    let sum: usize = widths.iter().map(|(_, w)| w).sum();
251    if sum != total {
252        return None;
253    }
254
255    let mut pos = 0usize;
256    let mut tokens: Vec<Token> = Vec::with_capacity(3);
257    for (_, width) in &widths {
258        let slice = &utterance[pos..pos + width];
259        let digit_count = *width as u8;
260        let value: i16 = slice.parse().ok()?;
261        tokens.push(Token::Numeric(value, digit_count));
262        pos += width;
263    }
264    Some(tokens)
265}
266
267// ---------------------------------------------------------------------------
268// Separator range detection
269// ---------------------------------------------------------------------------
270
/// Half-open byte range `[start, end)` within an utterance that is occupied by
/// separator text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct SeparatorRange {
    /// Byte offset of the first separator byte.
    start: usize,
    /// Byte offset one past the last separator byte.
    end: usize,
}
276
277fn find_separator_ranges(
278    utterance: &str,
279    separator_chars: &[char],
280    multi_char_separators: &[&str],
281) -> Vec<SeparatorRange> {
282    let mut ranges: Vec<SeparatorRange> = Vec::new();
283
284    for (byte_pos, ch) in utterance.char_indices() {
285        if separator_chars.contains(&ch) {
286            ranges.push(SeparatorRange {
287                start: byte_pos,
288                end: byte_pos + ch.len_utf8(),
289            });
290        }
291    }
292
293    for separator in multi_char_separators {
294        let mut search_from = 0usize;
295        while let Some(pos) = utterance[search_from..].find(separator) {
296            let absolute_start = search_from + pos;
297            let absolute_end = absolute_start + separator.len();
298            ranges.push(SeparatorRange {
299                start: absolute_start,
300                end: absolute_end,
301            });
302            search_from = absolute_end;
303        }
304    }
305
306    ranges.sort_by_key(|r| r.start);
307    merge_ranges(ranges)
308}
309
310fn merge_ranges(sorted: Vec<SeparatorRange>) -> Vec<SeparatorRange> {
311    let mut merged: Vec<SeparatorRange> = Vec::with_capacity(sorted.len());
312    for r in sorted {
313        if let Some(last) = merged.last_mut()
314            && r.start <= last.end
315        {
316            last.end = last.end.max(r.end);
317            continue;
318        }
319        merged.push(r);
320    }
321    merged
322}
323
324fn spans_between_separators<'u>(
325    utterance: &'u str,
326    separator_ranges: &[SeparatorRange],
327) -> Vec<&'u str> {
328    let mut spans: Vec<&'u str> = Vec::new();
329    let mut pos = 0usize;
330
331    for separator in separator_ranges {
332        if pos < separator.start {
333            spans.push(&utterance[pos..separator.start]);
334        }
335        pos = separator.end;
336    }
337
338    if pos < utterance.len() {
339        spans.push(&utterance[pos..]);
340    }
341
342    spans
343}
344
345// ---------------------------------------------------------------------------
346// Digit↔alpha boundary splitting
347// ---------------------------------------------------------------------------
348
/// Split `chunk` wherever an ASCII digit is immediately followed by an ASCII
/// letter, or vice versa.
///
/// One exception keeps ordinals intact: at a digit→letter boundary, if the
/// entire remainder of the chunk is `st`, `nd`, `rd`, or `th` (any case), no
/// split is made, so `"19th"` stays a single piece.
///
/// Always returns at least one piece; an empty chunk yields `[""]`.
fn sub_split_on_boundary(chunk: &str) -> Vec<&str> {
    const ORDINAL_SUFFIXES: [&str; 4] = ["st", "nd", "rd", "th"];

    let mut pieces: Vec<&str> = Vec::new();
    let mut piece_start = 0usize;

    for (offset, pair) in chunk.as_bytes().windows(2).enumerate() {
        let (before, after) = (pair[0], pair[1]);
        let digit_then_alpha = before.is_ascii_digit() && after.is_ascii_alphabetic();
        let alpha_then_digit = before.is_ascii_alphabetic() && after.is_ascii_digit();
        if !(digit_then_alpha || alpha_then_digit) {
            continue;
        }

        // Both neighbouring bytes are ASCII, so `boundary` is a valid char
        // boundary and slicing cannot panic.
        let boundary = offset + 1;
        let remainder = &chunk[boundary..];
        let is_ordinal_tail = digit_then_alpha
            && ORDINAL_SUFFIXES
                .iter()
                .any(|suffix| remainder.eq_ignore_ascii_case(suffix));
        if is_ordinal_tail {
            continue;
        }

        pieces.push(&chunk[piece_start..boundary]);
        piece_start = boundary;
    }

    pieces.push(&chunk[piece_start..]);
    pieces
}
373
374// ---------------------------------------------------------------------------
375// Token classification
376// ---------------------------------------------------------------------------
377
378fn classify(sub: &str) -> Option<Token> {
379    if sub.is_empty() {
380        return None;
381    }
382
383    if let Some(token) = try_classify_ordinal(sub) {
384        return Some(token);
385    }
386
387    if sub.chars().all(|c| c.is_ascii_digit()) {
388        let digit_count = sub.len() as u8;
389        return sub
390            .parse::<i16>()
391            .ok()
392            .map(|v| Token::Numeric(v, digit_count));
393    }
394
395    if let Ok(month) = MonthName::try_from(sub) {
396        return Some(Token::MonthName(month));
397    }
398
399    None
400}
401
402fn try_classify_ordinal(sub: &str) -> Option<Token> {
403    let digit_end = sub
404        .char_indices()
405        .find(|(_, c)| !c.is_ascii_digit())
406        .map(|(i, _)| i)?;
407
408    if digit_end == 0 {
409        return None;
410    }
411
412    let suffix = &sub[digit_end..];
413    let suffix_lower = suffix.to_ascii_lowercase();
414
415    match suffix_lower.as_str() {
416        "st" | "nd" | "rd" | "th" => {
417            let day_number = sub[..digit_end].parse::<u8>().ok()?;
418            Some(Token::OrdinalDay(day_number))
419        }
420        _ => None,
421    }
422}
partial_date/tokeniser.rs

partial_date/
tokeniser.rs