use crate::models::{Config, DateComponent, MonthName, Token};
use crate::word_numbers::replace_word_numbers;
/// Breaks a date utterance into at most three classified tokens.
///
/// The utterance is first passed through `replace_word_numbers`. When
/// `config.no_separator` is set and the normalised text can be read as a
/// separator-free digit run, that reading is returned directly. Otherwise the
/// text is cut on the standard separator characters plus any entries of
/// `config.extra_separators`, each chunk is further split on digit/letter
/// boundaries, and every piece that `classify` recognises becomes a token.
/// Pieces that are not recognised are dropped silently.
pub fn tokenise(utterance: &str, config: &Config) -> Vec<Token> {
    const MAX_TOKENS: usize = 3;
    const STANDARD_SEPS: &[char] = &[' ', '\t', '\n', '\r', '/', '-', '.', ',', '\\'];

    let normalised = replace_word_numbers(utterance);
    let utterance = normalised.as_str();

    // The compact reading is only attempted when the caller opted in.
    let compact = config
        .no_separator
        .then(|| try_tokenise_no_separator(utterance, &config.component_order))
        .flatten();
    if let Some(tokens) = compact {
        return tokens;
    }

    // Single-character extras join the per-char separator set; longer ones are
    // matched as substrings. Empty entries are ignored.
    let mut single_seps: Vec<char> = STANDARD_SEPS.to_vec();
    let mut long_seps: Vec<&str> = Vec::new();
    for extra in &config.extra_separators {
        let mut letters = extra.chars();
        match (letters.next(), letters.next()) {
            (Some(only), None) => single_seps.push(only),
            (Some(_), Some(_)) => long_seps.push(extra.as_str()),
            (None, _) => {}
        }
    }

    let ranges = find_separator_ranges(utterance, &single_seps, &long_seps);
    let is_letter_o = |c: char| matches!(c, 'o' | 'O');

    let mut tokens: Vec<Token> = Vec::with_capacity(MAX_TOKENS);
    for chunk in spans_between_separators(utterance, &ranges) {
        // Chunks without any letter or digit carry no information.
        if !chunk.chars().any(char::is_alphanumeric) {
            continue;
        }

        // A chunk made only of digits and the letter O (with at least one O)
        // is treated as a number whose zeros were written as the letter.
        let digits_with_letter_o = config.letter_o_substitution
            && chunk.chars().all(|c| c.is_ascii_digit() || is_letter_o(c))
            && chunk.chars().any(is_letter_o);

        let rewritten: String;
        let piece = if digits_with_letter_o {
            rewritten = chunk
                .chars()
                .map(|c| if is_letter_o(c) { '0' } else { c })
                .collect();
            rewritten.as_str()
        } else {
            chunk
        };

        for sub in sub_split_on_boundary(piece) {
            // Stop as soon as a further piece shows up after the third token.
            if tokens.len() == MAX_TOKENS {
                return tokens;
            }
            if let Some(token) = classify(sub) {
                tokens.push(token);
            }
        }
    }
    tokens
}
fn try_tokenise_no_separator(
utterance: &str,
order: &crate::models::ComponentOrder,
) -> Option<Vec<Token>> {
let bytes = utterance.as_bytes();
if !bytes.iter().all(|b| b.is_ascii_digit()) {
return None;
}
let (year_width, total) = match bytes.len() {
8 => (4usize, 8usize),
6 => (2usize, 6usize),
_ => return None,
};
let widths = [
(
order.first,
if order.first == DateComponent::Year {
year_width
} else {
2
},
),
(
order.second,
if order.second == DateComponent::Year {
year_width
} else {
2
},
),
(
order.third,
if order.third == DateComponent::Year {
year_width
} else {
2
},
),
];
let sum: usize = widths.iter().map(|(_, w)| w).sum();
if sum != total {
return None;
}
let mut pos = 0usize;
let mut tokens: Vec<Token> = Vec::with_capacity(3);
for (_, width) in &widths {
let slice = &utterance[pos..pos + width];
let digit_count = *width as u8;
let value: i16 = slice.parse().ok()?;
tokens.push(Token::Numeric(value, digit_count));
pos += width;
}
Some(tokens)
}
/// Half-open byte range `[start, end)` of an utterance that is covered by
/// separator text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct SeparatorRange {
    /// Byte offset of the first separator byte.
    start: usize,
    /// Byte offset one past the last separator byte.
    end: usize,
}
/// Locates every stretch of `utterance` that is occupied by a separator.
///
/// `separator_chars` are matched one character at a time and
/// `multi_char_separators` are matched as non-overlapping substrings. The
/// resulting byte ranges are sorted by start offset and handed to
/// `merge_ranges`, so the returned list is ordered and free of overlapping or
/// touching ranges.
///
/// Empty entries in `multi_char_separators` are ignored: an empty pattern
/// matches without consuming any input, so it can never delimit anything.
fn find_separator_ranges(
    utterance: &str,
    separator_chars: &[char],
    multi_char_separators: &[&str],
) -> Vec<SeparatorRange> {
    let mut ranges: Vec<SeparatorRange> = utterance
        .char_indices()
        .filter(|(_, ch)| separator_chars.contains(ch))
        .map(|(byte_pos, ch)| SeparatorRange {
            start: byte_pos,
            end: byte_pos + ch.len_utf8(),
        })
        .collect();

    for separator in multi_char_separators {
        // Guard against zero-length patterns, which would otherwise match at
        // the same position forever without making progress.
        if separator.is_empty() {
            continue;
        }
        ranges.extend(
            utterance
                .match_indices(*separator)
                .map(|(start, hit)| SeparatorRange {
                    start,
                    end: start + hit.len(),
                }),
        );
    }

    ranges.sort_by_key(|r| r.start);
    merge_ranges(ranges)
}
fn merge_ranges(sorted: Vec<SeparatorRange>) -> Vec<SeparatorRange> {
let mut merged: Vec<SeparatorRange> = Vec::with_capacity(sorted.len());
for r in sorted {
if let Some(last) = merged.last_mut()
&& r.start <= last.end
{
last.end = last.end.max(r.end);
continue;
}
merged.push(r);
}
merged
}
/// Returns the non-empty pieces of `utterance` that lie between separators.
///
/// Each gap runs from the end of one separator range (or the start of the
/// utterance) to the start of the next one (or the end of the utterance).
/// Gaps with no content are omitted.
fn spans_between_separators<'u>(
    utterance: &'u str,
    separator_ranges: &[SeparatorRange],
) -> Vec<&'u str> {
    // Gap i begins where range i-1 ended (offset 0 for the first gap) ...
    let gap_starts = std::iter::once(0usize).chain(separator_ranges.iter().map(|r| r.end));
    // ... and finishes where range i begins (end of input for the last gap).
    let gap_ends = separator_ranges
        .iter()
        .map(|r| r.start)
        .chain(std::iter::once(utterance.len()));

    gap_starts
        .zip(gap_ends)
        .filter(|(from, to)| from < to)
        .map(|(from, to)| &utterance[from..to])
        .collect()
}
/// Splits `chunk` wherever an ASCII digit sits next to an ASCII letter.
///
/// A digit-to-letter boundary is left intact when everything after it is
/// exactly an ordinal suffix (`st`, `nd`, `rd`, `th`, any case), so `"1st"`
/// stays in one piece. The result always contains at least one element; an
/// empty chunk yields a single empty piece.
fn sub_split_on_boundary(chunk: &str) -> Vec<&str> {
    const ORDINAL_SUFFIXES: [&str; 4] = ["st", "nd", "rd", "th"];

    let mut pieces: Vec<&str> = Vec::new();
    let mut piece_start = 0usize;

    for (offset, pair) in chunk.as_bytes().windows(2).enumerate() {
        let (left, right) = (pair[0], pair[1]);
        let digit_then_letter = left.is_ascii_digit() && right.is_ascii_alphabetic();
        let letter_then_digit = left.is_ascii_alphabetic() && right.is_ascii_digit();
        if !(digit_then_letter || letter_then_digit) {
            continue;
        }

        // Both neighbours are ASCII, so `boundary` is a valid char boundary.
        let boundary = offset + 1;
        if digit_then_letter {
            let rest = &chunk[boundary..];
            if ORDINAL_SUFFIXES
                .iter()
                .any(|suffix| rest.eq_ignore_ascii_case(suffix))
            {
                continue;
            }
        }

        pieces.push(&chunk[piece_start..boundary]);
        piece_start = boundary;
    }

    pieces.push(&chunk[piece_start..]);
    pieces
}
fn classify(sub: &str) -> Option<Token> {
if sub.is_empty() {
return None;
}
if let Some(token) = try_classify_ordinal(sub) {
return Some(token);
}
if sub.chars().all(|c| c.is_ascii_digit()) {
let digit_count = sub.len() as u8;
return sub
.parse::<i16>()
.ok()
.map(|v| Token::Numeric(v, digit_count));
}
if let Ok(month) = MonthName::try_from(sub) {
return Some(Token::MonthName(month));
}
None
}
/// Recognises an ordinal day such as `"1st"`, `"22ND"` or `"3rd"`.
///
/// The text must consist of one or more ASCII digits followed by exactly one
/// of the suffixes `st`, `nd`, `rd` or `th` (case-insensitive), and the digits
/// must fit in a `u8`. The suffix is not checked for agreement with the
/// number. Returns `None` in every other case.
fn try_classify_ordinal(sub: &str) -> Option<Token> {
    // Byte offset of the first non-digit; a pure digit run is not an ordinal.
    let digit_len = sub.find(|c: char| !c.is_ascii_digit())?;
    if digit_len == 0 {
        return None;
    }

    let (digits, suffix) = sub.split_at(digit_len);
    let has_ordinal_suffix = ["st", "nd", "rd", "th"]
        .iter()
        .any(|candidate| suffix.eq_ignore_ascii_case(candidate));
    if !has_ordinal_suffix {
        return None;
    }

    digits.parse::<u8>().ok().map(Token::OrdinalDay)
}