partial_date/tokeniser.rs
1//! Tokenisation: splitting and classifying utterance chunks into date tokens.
2
3use crate::models::{Config, DateComponent, MonthName, Token};
4use crate::word_numbers::replace_word_numbers;
5
6/// Split `utterance` on any standard separator or extra separator and classify
7/// each resulting chunk as a [`Token`].
8///
9/// # What counts as a separator
10///
11/// The standard separator set is: ASCII whitespace (space, tab, newline,
12/// carriage return), `/`, `-`, `.`, `,`, `\`. Any additional strings in
13/// `config.extra_separators` are also treated as separators.
14///
15/// When `config.no_separator` is `true` and the utterance is a pure digit
16/// string of length 6 or 8, it is sliced positionally according to
17/// `config.component_order` rather than split on separators.
18///
19/// # Classification
20///
21/// Each non-separator chunk is examined for digit-to-alpha (or alpha-to-digit)
22/// boundaries, allowing adjacent tokens like `"19october"` or `"August7"` to
23/// be split and classified independently.
24///
25/// - [`Token::OrdinalDay`] — digit run followed by `st`, `nd`, `rd`, or `th`.
26/// - [`Token::MonthName`] — full name, 3-letter abbreviation, unambiguous
27/// prefix, or fuzzy misspelling.
28/// - [`Token::Numeric`] — a run of ASCII digits; stores `(value, digit_count)`.
29/// - Anything else (noise words, stray punctuation) is silently discarded.
30///
31/// When [`Config::letter_o_substitution`] is `true` (the default), any token
32/// whose characters are all ASCII digits or the letter `O` (upper or lower
33/// case) is treated as a numeric token with every `O`/`o` replaced by `0`.
34/// This handles OCR and typing errors such as `"2O24"` → `2024`. The
35/// substitution applies only to isolated tokens; a letter O that is part of a
36/// longer alphabetic run (e.g. `"october"`) is never affected because
37/// `sub_split_on_boundary` has already separated digit and alpha runs.
38///
39/// At most **three** tokens are returned.
40///
41/// # Examples
42///
43/// ```
44/// use partial_date::extract::tokenise;
45/// use partial_date::models::{Config, MonthName, Token};
46///
47/// assert_eq!(
48/// tokenise("19 October 2014", &Config::default()),
49/// vec![
50/// Token::Numeric(19, 2),
51/// Token::MonthName(MonthName::October),
52/// Token::Numeric(2014, 4),
53/// ]
54/// );
55///
56/// assert_eq!(
57/// tokenise("19th October,2015", &Config::default()),
58/// vec![
59/// Token::OrdinalDay(19),
60/// Token::MonthName(MonthName::October),
61/// Token::Numeric(2015, 4),
62/// ]
63/// );
64///
65/// assert_eq!(
66/// tokenise("19october", &Config::default()),
67/// vec![
68/// Token::Numeric(19, 2),
69/// Token::MonthName(MonthName::October),
70/// ]
71/// );
72///
73/// // Letter O substitution (enabled by default):
74/// assert_eq!(
75/// tokenise("2O24", &Config::default()),
76/// vec![Token::Numeric(2024, 4)]
77/// );
78///
79/// // "7october" — the O is part of "october", not a standalone token, so
80/// // substitution does not apply and the month name is recognised normally.
81/// assert_eq!(
82/// tokenise("7october", &Config::default()),
83/// vec![
84/// Token::Numeric(7, 1),
85/// Token::MonthName(MonthName::October),
86/// ]
87/// );
88/// ```
89pub fn tokenise(utterance: &str, config: &Config) -> Vec<Token> {
90 // Replace any word-number spans (e.g. "twenty-three") with their digit
91 // equivalents before any further processing. This is done unconditionally
92 // so that "nineteen eighty-four" becomes "1984" and is then classified as a
93 // normal Numeric token. The replacement is non-destructive for utterances
94 // that contain no word numbers.
95 let normalised = replace_word_numbers(utterance);
96 let utterance = normalised.as_str();
97
98 // No-separator path: pure-digit string of length 6 (DDMMYY) or 8 (DDMMYYYY).
99 if config.no_separator
100 && let Some(tokens) = try_tokenise_no_separator(utterance, &config.component_order)
101 {
102 return tokens;
103 }
104
105 // Standard separator path.
106 const STANDARD_SEPS: &[char] = &[' ', '\t', '\n', '\r', '/', '-', '.', ',', '\\'];
107
108 let mut separator_chars: Vec<char> = STANDARD_SEPS.to_vec();
109 let mut multi_char_separators: Vec<&str> = Vec::new();
110
111 for s in &config.extra_separators {
112 let mut chars = s.chars();
113 if let Some(first) = chars.next() {
114 if chars.next().is_none() {
115 separator_chars.push(first);
116 } else {
117 multi_char_separators.push(s.as_str());
118 }
119 }
120 }
121
122 let separator_ranges =
123 find_separator_ranges(utterance, &separator_chars, &multi_char_separators);
124 let raw_chunks = spans_between_separators(utterance, &separator_ranges);
125
126 // A date has at most three components, so we never need more than three
127 // tokens. Pre-allocating exactly 3 avoids any reallocation.
128 let mut tokens: Vec<Token> = Vec::with_capacity(3);
129
130 // TODO: Split this into it's own fn and add unit tests for it. It's doing a lot of work and has some non-trivial logic that deserves its own tests.
131 // Label the outer loop so the inner loop can break out of both at once
132 // when the token limit is reached (see the `break 'outer` below).
133 'outer: for chunk in raw_chunks {
134 // Skip chunks that contain no alphanumeric characters at all — e.g. a
135 // stray "!" or "--" that survived the separator pass. There is nothing
136 // here that could become a token.
137 if !chunk.chars().any(|c| c.is_alphanumeric()) {
138 continue;
139 }
140
141 // Letter-O substitution at the chunk level: when the entire chunk
142 // consists solely of ASCII digits and the letter O (upper or lower
143 // case), replace every O/o with '0' before boundary splitting.
144 //
145 // This is intentionally a whole-chunk check: "2O24" → "2024" (all
146 // chars are digit-or-O), but "7october" is left untouched because
147 // "october" contains characters other than O, so the chunk as a whole
148 // does not satisfy the all-digit-or-O predicate.
149 //
150 // Performing the substitution here, before sub_split_on_boundary, is
151 // essential: if we waited until after splitting, "2O24" would be
152 // fragmented into ["2", "O", "24"] at the digit↔alpha boundaries,
153 // producing three separate tokens instead of the single Numeric(2024).
154 let substituted_chunk: String;
155 let effective_chunk = if config.letter_o_substitution
156 && chunk
157 .chars()
158 .all(|c| c.is_ascii_digit() || c == 'o' || c == 'O')
159 && chunk.chars().any(|c| c == 'o' || c == 'O')
160 {
161 substituted_chunk = chunk
162 .chars()
163 .map(|c| if c == 'o' || c == 'O' { '0' } else { c })
164 .collect();
165 substituted_chunk.as_str()
166 } else {
167 chunk
168 };
169
170 // A chunk may contain a digit-to-alpha or alpha-to-digit boundary with
171 // no separator — e.g. "19october" or "August7". sub_split_on_boundary
172 // splits at those transitions so each run can be classified on its own.
173 // For a plain chunk like "2014" this produces a single-element vec, so
174 // the inner loop runs exactly once.
175 //
176 // Note: ordinal suffixes ("19th", "3rd") are intentionally NOT split —
177 // the boundary detector leaves them intact so classify() can recognise
178 // the whole thing as Token::OrdinalDay.
179 for sub in sub_split_on_boundary(effective_chunk) {
180 // Stop as soon as we have day, month, and year — there is nothing
181 // useful left to collect. `break 'outer` exits both loops at once;
182 // a plain `break` would only exit this inner loop and the outer
183 // loop would continue consuming chunks needlessly.
184 if tokens.len() == 3 {
185 break 'outer;
186 }
187
188 // classify() tries to turn the sub-slice into a Token::OrdinalDay,
189 // Token::Numeric, or Token::MonthName. Noise words ("the", "of")
190 // and unrecognised strings return None and are silently dropped —
191 // no error, no placeholder.
192 if let Some(token) = classify(sub) {
193 tokens.push(token);
194 }
195 }
196 }
197
198 tokens
199}
200
201/// Attempt to tokenise a no-separator pure-digit string by positional slicing.
202///
203/// Handles lengths 6 (two-digit year) and 8 (four-digit year). Returns `None`
204/// if the string is not purely digits or not one of the expected lengths.
205fn try_tokenise_no_separator(
206 utterance: &str,
207 order: &crate::models::ComponentOrder,
208) -> Option<Vec<Token>> {
209 let bytes = utterance.as_bytes();
210 if !bytes.iter().all(|b| b.is_ascii_digit()) {
211 return None;
212 }
213
214 // Determine slice widths: year slot gets 4 digits (8-char) or 2 (6-char).
215 let (year_width, total) = match bytes.len() {
216 8 => (4usize, 8usize),
217 6 => (2usize, 6usize),
218 _ => return None,
219 };
220
221 // Build (component, width) pairs in order.
222 let widths = [
223 (
224 order.first,
225 if order.first == DateComponent::Year {
226 year_width
227 } else {
228 2
229 },
230 ),
231 (
232 order.second,
233 if order.second == DateComponent::Year {
234 year_width
235 } else {
236 2
237 },
238 ),
239 (
240 order.third,
241 if order.third == DateComponent::Year {
242 year_width
243 } else {
244 2
245 },
246 ),
247 ];
248
249 // Verify widths sum to the total length.
250 let sum: usize = widths.iter().map(|(_, w)| w).sum();
251 if sum != total {
252 return None;
253 }
254
255 let mut pos = 0usize;
256 let mut tokens: Vec<Token> = Vec::with_capacity(3);
257 for (_, width) in &widths {
258 let slice = &utterance[pos..pos + width];
259 let digit_count = *width as u8;
260 let value: i16 = slice.parse().ok()?;
261 tokens.push(Token::Numeric(value, digit_count));
262 pos += width;
263 }
264 Some(tokens)
265}
266
267// ---------------------------------------------------------------------------
268// Separator range detection
269// ---------------------------------------------------------------------------
270
/// Half-open byte range `[start, end)` of an utterance that is occupied by
/// separator text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct SeparatorRange {
    /// Byte offset of the first separator byte.
    start: usize,
    /// Byte offset one past the last separator byte.
    end: usize,
}

/// Locate every separator occurrence in `utterance`.
///
/// `separator_chars` are matched character by character;
/// `multi_char_separators` are matched as substrings, left to right and
/// without overlap. Empty strings in `multi_char_separators` are ignored: an
/// empty pattern matches at every position without consuming input, so it can
/// never delimit anything.
///
/// The result is sorted by start offset, with overlapping or directly adjacent
/// ranges coalesced into one.
fn find_separator_ranges(
    utterance: &str,
    separator_chars: &[char],
    multi_char_separators: &[&str],
) -> Vec<SeparatorRange> {
    let mut ranges: Vec<SeparatorRange> = utterance
        .char_indices()
        .filter(|(_, ch)| separator_chars.contains(ch))
        .map(|(start, ch)| SeparatorRange {
            start,
            end: start + ch.len_utf8(),
        })
        .collect();

    for separator in multi_char_separators
        .iter()
        .copied()
        .filter(|separator| !separator.is_empty())
    {
        ranges.extend(
            utterance
                .match_indices(separator)
                .map(|(start, matched)| SeparatorRange {
                    start,
                    end: start + matched.len(),
                }),
        );
    }

    // Relative order of ranges sharing a start offset is irrelevant because
    // merging keeps the maximum end, so an unstable sort is sufficient.
    ranges.sort_unstable_by_key(|range| range.start);
    merge_ranges(ranges)
}

/// Coalesce ranges that overlap or touch.
///
/// `sorted` must be ordered by ascending `start`; the output preserves that
/// order and contains only disjoint, non-adjacent ranges.
fn merge_ranges(sorted: Vec<SeparatorRange>) -> Vec<SeparatorRange> {
    let mut merged: Vec<SeparatorRange> = Vec::with_capacity(sorted.len());
    for range in sorted {
        match merged.last_mut() {
            // Starts inside (or exactly at the end of) the previous range:
            // extend that range instead of opening a new one.
            Some(last) if range.start <= last.end => last.end = last.end.max(range.end),
            _ => merged.push(range),
        }
    }
    merged
}
323
324fn spans_between_separators<'u>(
325 utterance: &'u str,
326 separator_ranges: &[SeparatorRange],
327) -> Vec<&'u str> {
328 let mut spans: Vec<&'u str> = Vec::new();
329 let mut pos = 0usize;
330
331 for separator in separator_ranges {
332 if pos < separator.start {
333 spans.push(&utterance[pos..separator.start]);
334 }
335 pos = separator.end;
336 }
337
338 if pos < utterance.len() {
339 spans.push(&utterance[pos..]);
340 }
341
342 spans
343}
344
345// ---------------------------------------------------------------------------
346// Digit↔alpha boundary splitting
347// ---------------------------------------------------------------------------
348
/// Split `chunk` wherever an ASCII digit is directly followed by an ASCII
/// letter, or an ASCII letter by an ASCII digit.
///
/// One exception keeps ordinals whole: a digit→letter boundary is not split
/// when everything after it is exactly `st`, `nd`, `rd`, or `th` (any case),
/// so `"19th"` stays a single piece.
///
/// Always returns at least one piece; a chunk without boundaries (including
/// the empty string) is returned as the sole element.
fn sub_split_on_boundary(chunk: &str) -> Vec<&str> {
    const ORDINAL_SUFFIXES: [&str; 4] = ["st", "nd", "rd", "th"];

    let mut pieces: Vec<&str> = Vec::new();
    let mut piece_start = 0usize;

    for (offset, pair) in chunk.as_bytes().windows(2).enumerate() {
        let (left, right) = (pair[0], pair[1]);
        let digit_then_alpha = left.is_ascii_digit() && right.is_ascii_alphabetic();
        let alpha_then_digit = left.is_ascii_alphabetic() && right.is_ascii_digit();
        if !(digit_then_alpha || alpha_then_digit) {
            continue;
        }

        // `right` is an ASCII byte, so `boundary` is a valid char boundary.
        let boundary = offset + 1;

        if digit_then_alpha {
            let rest = &chunk[boundary..];
            if ORDINAL_SUFFIXES
                .iter()
                .any(|suffix| rest.eq_ignore_ascii_case(suffix))
            {
                continue;
            }
        }

        pieces.push(&chunk[piece_start..boundary]);
        piece_start = boundary;
    }

    pieces.push(&chunk[piece_start..]);
    pieces
}
373
374// ---------------------------------------------------------------------------
375// Token classification
376// ---------------------------------------------------------------------------
377
378fn classify(sub: &str) -> Option<Token> {
379 if sub.is_empty() {
380 return None;
381 }
382
383 if let Some(token) = try_classify_ordinal(sub) {
384 return Some(token);
385 }
386
387 if sub.chars().all(|c| c.is_ascii_digit()) {
388 let digit_count = sub.len() as u8;
389 return sub
390 .parse::<i16>()
391 .ok()
392 .map(|v| Token::Numeric(v, digit_count));
393 }
394
395 if let Ok(month) = MonthName::try_from(sub) {
396 return Some(Token::MonthName(month));
397 }
398
399 None
400}
401
402fn try_classify_ordinal(sub: &str) -> Option<Token> {
403 let digit_end = sub
404 .char_indices()
405 .find(|(_, c)| !c.is_ascii_digit())
406 .map(|(i, _)| i)?;
407
408 if digit_end == 0 {
409 return None;
410 }
411
412 let suffix = &sub[digit_end..];
413 let suffix_lower = suffix.to_ascii_lowercase();
414
415 match suffix_lower.as_str() {
416 "st" | "nd" | "rd" | "th" => {
417 let day_number = sub[..digit_end].parse::<u8>().ok()?;
418 Some(Token::OrdinalDay(day_number))
419 }
420 _ => None,
421 }
422}