use alloc::string::String;
use tracing::trace;
/// Marker token that `normalize` writes in place of a comma.
pub const COMMA_PAUSE: &str = ",pause";
/// Marker token that `normalize` writes in place of a period or semicolon.
pub const PERIOD_PAUSE: &str = ".pause";
/// Token written before an emphasized word by `normalize_with_emphasis`.
pub const EMPHASIS_START: &str = "<emph>";
/// Token written after an emphasized word by `normalize_with_emphasis`.
pub const EMPHASIS_END: &str = "</emph>";
/// Normalizes raw text into lowercase, space-separated words with pause markers.
///
/// The text is first passed through `expand_abbreviations`, `expand_acronyms`
/// and `expand_numbers`. The expanded text is then scanned character by
/// character:
///
/// * alphabetic characters, `'` and `-` are kept (lowercased);
/// * `,` becomes the `,pause` marker, `.` and `;` become `.pause`;
/// * whitespace, `!` and `?` end the current word;
/// * every other character is dropped without ending the word.
///
/// Words and markers are separated by exactly one space, and the result never
/// starts with a space, even when the input begins with punctuation.
#[must_use]
pub fn normalize(text: &str) -> String {
    let abbr_expanded = expand_abbreviations(text);
    let acronym_expanded = expand_acronyms(&abbr_expanded);
    let expanded = expand_numbers(&acronym_expanded);

    let mut result = String::with_capacity(expanded.len());
    // True when the next kept character starts a new word.
    let mut pending_space = false;
    for ch in expanded.chars() {
        // Literals mirror the COMMA_PAUSE / PERIOD_PAUSE constants.
        let marker = match ch {
            ',' => Some(",pause"),
            '.' | ';' => Some(".pause"),
            _ => None,
        };
        if let Some(marker) = marker {
            // Only separate from preceding output; a marker at the very start
            // must not introduce a leading space.
            if !result.is_empty() {
                result.push(' ');
            }
            result.push_str(marker);
            pending_space = true;
        } else if ch.is_alphabetic() || ch == '\'' || ch == '-' {
            if pending_space && !result.is_empty() {
                result.push(' ');
            }
            result.push(ch.to_lowercase().next().unwrap_or(ch));
            pending_space = false;
        } else if ch.is_whitespace() || ch == '!' || ch == '?' {
            pending_space = true;
        }
    }
    result
}
/// Normalizes text like `normalize`, additionally wrapping emphasized tokens
/// in `EMPHASIS_START` / `EMPHASIS_END`.
///
/// The text is expanded with `expand_abbreviations` and `expand_numbers`, split
/// on whitespace, and each token is handled as follows:
///
/// * `*word*` (asterisks around a purely alphabetic word) is emitted as
///   `<emph> word </emph>` with the word lowercased;
/// * a token with at least three alphabetic characters, all uppercase, is
///   normalized with `normalize_token_into` and wrapped the same way;
/// * any other token is normalized with `normalize_token_into`.
///
/// Tokens are joined by a single space. A token that normalizes to nothing
/// (for example a standalone `!`) contributes neither text nor a separator,
/// so it cannot cause doubled or trailing spaces.
#[must_use]
pub fn normalize_with_emphasis(text: &str) -> String {
    let abbr_expanded = expand_abbreviations(text);
    let expanded = expand_numbers(&abbr_expanded);

    let token_count = expanded.split_whitespace().count();
    let mut result = String::with_capacity(expanded.len() + token_count * 8);

    for token in expanded.split_whitespace() {
        // Remember where this token's separator starts so it can be undone
        // if the token turns out to be empty after normalization.
        let rollback = result.len();
        if !result.is_empty() {
            result.push(' ');
        }
        let content_start = result.len();

        let starred = token
            .strip_prefix('*')
            .and_then(|rest| rest.strip_suffix('*'))
            .filter(|inner| !inner.is_empty() && inner.chars().all(char::is_alphabetic));

        if let Some(inner) = starred {
            result.push_str(EMPHASIS_START);
            result.push(' ');
            for ch in inner.chars() {
                result.push(ch.to_lowercase().next().unwrap_or(ch));
            }
            result.push(' ');
            result.push_str(EMPHASIS_END);
        } else {
            let letters = || token.chars().filter(|c| c.is_alphabetic());
            let shouted = letters().count() >= 3 && letters().all(char::is_uppercase);
            if shouted {
                result.push_str(EMPHASIS_START);
                result.push(' ');
                normalize_token_into(token, &mut result);
                result.push(' ');
                result.push_str(EMPHASIS_END);
            } else {
                normalize_token_into(token, &mut result);
            }
        }

        if result.len() == content_start {
            // Nothing was written for this token: drop its separator again.
            result.truncate(rollback);
        }
    }
    result
}
/// Appends the normalized form of `token` to `result`.
///
/// Alphabetic characters, `'` and `-` are appended lowercased; `,` becomes the
/// `,pause` marker and `.` / `;` become `.pause`; whitespace, `!` and `?` end
/// the current word; any other character is dropped.
///
/// A separating space is inserted before a marker or a new word only when
/// `result` is non-empty and does not already end with a space, so appending
/// to an empty buffer or right after a caller-provided separator never yields
/// a leading or doubled space. No separator is added before the first word
/// character of `token`; the caller is responsible for that one.
fn normalize_token_into(token: &str, result: &mut String) {
    // True when the next kept character starts a new word.
    let mut pending_space = false;
    for ch in token.chars() {
        // Literals mirror the COMMA_PAUSE / PERIOD_PAUSE constants.
        let marker = match ch {
            ',' => Some(",pause"),
            '.' | ';' => Some(".pause"),
            _ => None,
        };
        if let Some(marker) = marker {
            if !result.is_empty() && !result.ends_with(' ') {
                result.push(' ');
            }
            result.push_str(marker);
            pending_space = true;
        } else if ch.is_alphabetic() || ch == '\'' || ch == '-' {
            if pending_space && !result.is_empty() && !result.ends_with(' ') {
                result.push(' ');
            }
            result.push(ch.to_lowercase().next().unwrap_or(ch));
            pending_space = false;
        } else if ch.is_whitespace() || ch == '!' || ch == '?' {
            pending_space = true;
        }
    }
}
/// Replaces known abbreviations at the start of each whitespace-separated word
/// with their spoken expansion.
///
/// Words are re-joined with single spaces. When `match_abbreviation` reports a
/// match, the expansion is written followed by whatever remains of the word
/// after the consumed prefix; all other words are copied unchanged.
#[must_use]
pub fn expand_abbreviations(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for (idx, word) in text.split_whitespace().enumerate() {
        if idx != 0 {
            out.push(' ');
        }
        match match_abbreviation(word) {
            Some((expansion, consumed)) => {
                out.push_str(expansion);
                out.push_str(&word[consumed..]);
            }
            None => out.push_str(word),
        }
    }
    out
}
/// Looks for a known abbreviation at the start of `text`.
///
/// The comparison is ASCII case-insensitive. A match only counts when the
/// abbreviation is followed by the end of `text` or by a non-alphabetic
/// character, so `"St.Louis"` is not treated as `"st."`.
///
/// Returns the expansion and the number of bytes of `text` the abbreviation
/// occupies, or `None` when nothing matches.
fn match_abbreviation(text: &str) -> Option<(&'static str, usize)> {
    static ABBREVIATIONS: &[(&str, &str)] = &[
        ("approx.", "approximately"),
        ("blvd.", "boulevard"),
        ("capt.", "captain"),
        ("col.", "colonel"),
        ("corp.", "corporation"),
        ("dept.", "department"),
        ("govt.", "government"),
        ("prof.", "professor"),
        ("mrs.", "missus"),
        ("mr.", "mister"),
        ("dr.", "doctor"),
        ("sr.", "senior"),
        ("jr.", "junior"),
        ("st.", "saint"),
        ("ave.", "avenue"),
        ("etc.", "et cetera"),
        ("vs.", "versus"),
        ("inc.", "incorporated"),
        ("ltd.", "limited"),
        ("gen.", "general"),
        ("sgt.", "sergeant"),
        ("lt.", "lieutenant"),
        ("pt.", "point"),
        ("ft.", "feet"),
        ("mt.", "mount"),
    ];

    ABBREVIATIONS.iter().find_map(|&(abbr, expansion)| {
        // `get` yields `None` when `text` is too short or the cut would fall
        // inside a multi-byte character, so the slice below can never panic.
        // Comparing against `text` itself (instead of a lowercased copy) also
        // guarantees the returned byte count is valid for `text`.
        let prefix = text.get(..abbr.len())?;
        if !prefix.eq_ignore_ascii_case(abbr) {
            return None;
        }
        let at_boundary = text[abbr.len()..]
            .chars()
            .next()
            .map_or(true, |next| !next.is_alphabetic());
        at_boundary.then(|| (expansion, abbr.len()))
    })
}
/// Rewrites acronyms so they can be spoken.
///
/// Each whitespace-separated word is split into a core and its trailing
/// punctuation. If the core passes `is_acronym`, it is lowercased and either
/// kept as one word (when `is_pronounceable_acronym` accepts it) or spelled
/// out with a space between letters; the trailing punctuation is re-attached.
/// Any other word is copied unchanged. Words are joined with single spaces.
#[must_use]
pub fn expand_acronyms(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for (idx, word) in text.split_whitespace().enumerate() {
        if idx != 0 {
            out.push(' ');
        }
        let (core, trailing) = split_trailing_punct(word);
        if !is_acronym(core) {
            out.push_str(word);
            continue;
        }
        // Unpronounceable acronyms are read letter by letter.
        let spell_out = !is_pronounceable_acronym(core);
        for (pos, letter) in core.chars().enumerate() {
            if spell_out && pos > 0 {
                out.push(' ');
            }
            out.push(letter.to_lowercase().next().unwrap_or(letter));
        }
        out.push_str(trailing);
    }
    out
}
/// Splits `word` into its leading part and the run of non-alphanumeric
/// characters at its end.
///
/// The second element is empty when `word` ends with an alphanumeric
/// character; the first is empty when `word` has no alphanumeric characters.
fn split_trailing_punct(word: &str) -> (&str, &str) {
    let core = word.trim_end_matches(|c: char| !c.is_alphanumeric());
    word.split_at(core.len())
}
/// Returns `true` when `word` is three to five bytes long and consists solely
/// of ASCII uppercase letters.
fn is_acronym(word: &str) -> bool {
    matches!(word.len(), 3..=5) && word.bytes().all(|b| b.is_ascii_uppercase())
}
/// Heuristically decides whether an acronym can be read as a word.
///
/// Only the first three lowercased characters are examined. The acronym is
/// considered pronounceable when
///
/// * its first or second character is a vowel (`a`, `e`, `i`, `o`, `u`), or
/// * its third character is a vowel and the first two characters form one of
///   the listed consonant onsets (such as `st`, `tr` or `ch`).
///
/// Everything else, including words without a vowel in the first three
/// positions, is rejected.
fn is_pronounceable_acronym(word: &str) -> bool {
    // Two-consonant clusters accepted in front of a vowel.
    const ONSETS: &[(char, char)] = &[
        ('b', 'l'),
        ('b', 'r'),
        ('c', 'l'),
        ('c', 'r'),
        ('d', 'r'),
        ('f', 'l'),
        ('f', 'r'),
        ('g', 'l'),
        ('g', 'r'),
        ('p', 'l'),
        ('p', 'r'),
        ('s', 'c'),
        ('s', 'k'),
        ('s', 'l'),
        ('s', 'm'),
        ('s', 'n'),
        ('s', 'p'),
        ('s', 't'),
        ('s', 'w'),
        ('t', 'r'),
        ('t', 'w'),
        ('t', 'h'),
        ('s', 'h'),
        ('c', 'h'),
        ('w', 'h'),
    ];
    let vowel = |c: char| "aeiou".contains(c);

    let lowered = word.to_lowercase();
    let mut letters = lowered.chars();
    let head = (letters.next(), letters.next(), letters.next());

    match head {
        (Some(first), _, _) if vowel(first) => true,
        (_, Some(second), _) if vowel(second) => true,
        // Reaching this arm implies the first two characters are consonants.
        (Some(first), Some(second), Some(third)) if vowel(third) => {
            ONSETS.contains(&(first, second))
        }
        _ => false,
    }
}
#[must_use]
pub fn is_foreign_word(word: &str) -> bool {
word.chars()
.any(|c| c.is_alphabetic() && !c.is_ascii_alphabetic() && is_latin_extended(c))
}
/// Returns `true` when `c` lies in the code point range U+00C0 to U+024F
/// (inclusive).
fn is_latin_extended(c: char) -> bool {
    matches!(c, '\u{00C0}'..='\u{024F}')
}
/// Replaces common accented Latin letters with an unaccented ASCII letter.
///
/// Both lowercase and uppercase forms of the accented vowels, `ñ` and `ç` are
/// handled, and the case of the input letter is preserved. Characters outside
/// this table are returned unchanged.
#[must_use]
pub fn strip_diacritics(word: &str) -> String {
    word.chars()
        .map(|c| match c {
            'á' | 'à' | 'â' | 'ä' | 'ã' => 'a',
            'Á' | 'À' | 'Â' | 'Ä' | 'Ã' => 'A',
            'é' | 'è' | 'ê' | 'ë' => 'e',
            'É' | 'È' | 'Ê' | 'Ë' => 'E',
            'í' | 'ì' | 'î' | 'ï' => 'i',
            'Í' | 'Ì' | 'Î' | 'Ï' => 'I',
            'ó' | 'ò' | 'ô' | 'ö' | 'õ' => 'o',
            'Ó' | 'Ò' | 'Ô' | 'Ö' | 'Õ' => 'O',
            'ú' | 'ù' | 'û' | 'ü' => 'u',
            'Ú' | 'Ù' | 'Û' | 'Ü' => 'U',
            'ñ' => 'n',
            'Ñ' => 'N',
            // NOTE(review): cedilla maps to 's' rather than its base letter
            // 'c' — presumably a deliberate phonetic choice; confirm.
            'ç' => 's',
            'Ç' => 'S',
            _ => c,
        })
        .collect()
}
/// Replaces every number in `text` with its spoken English form.
///
/// Digit runs are gathered with `collect_number` and converted with
/// `expand_number_token`; all other characters are copied through unchanged.
///
/// * A `-` directly before a digit is read as "negative" only when it opens a
///   token, i.e. at the start of the text or after a non-alphanumeric
///   character. A hyphen that follows a letter or digit (`covid-19`, `3-5`)
///   is copied through as an ordinary character instead.
/// * The spoken number is separated by a space from preceding text, and from
///   a letter that follows it directly (`5kg` becomes `five kg`).
#[must_use]
pub fn expand_numbers(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let chars: alloc::vec::Vec<char> = text.chars().collect();
    let mut i = 0;
    while i < chars.len() {
        let current = chars[i];
        let digit_follows = chars.get(i + 1).map_or(false, char::is_ascii_digit);
        // A minus sign must not be glued to a preceding letter or digit;
        // otherwise ranges and hyphenated names would be read as negatives.
        let opens_token = i == 0 || !chars[i - 1].is_alphanumeric();
        let is_negative = current == '-' && digit_follows && opens_token;

        if !is_negative && !current.is_ascii_digit() {
            result.push(current);
            i += 1;
            continue;
        }

        if is_negative {
            // Skip the sign; the digits start right after it.
            i += 1;
        }
        let (num_str, consumed) = collect_number(&chars[i..]);
        let expanded = expand_number_token(&num_str);
        if is_negative {
            trace!(
                num_str = num_str.as_str(),
                expanded = expanded.as_str(),
                "expanded negative number"
            );
        } else {
            trace!(
                num_str = num_str.as_str(),
                expanded = expanded.as_str(),
                "expanded number"
            );
        }

        if !result.is_empty() && !result.ends_with(' ') {
            result.push(' ');
        }
        if is_negative {
            result.push_str("negative ");
        }
        result.push_str(&expanded);
        i += consumed;

        // Keep a unit or suffix written directly after the digits from
        // fusing with the last number word.
        if chars.get(i).map_or(false, |next| next.is_alphabetic()) {
            result.push(' ');
        }
    }
    result
}
/// Reads a number from the front of `chars`.
///
/// Digits are copied to the output. A comma is skipped when a digit follows
/// it. The first `.` that is followed by a digit is copied as the decimal
/// point. Scanning stops at the first character that fits none of these
/// rules.
///
/// Returns the collected digits (with at most one `.`) and the number of
/// characters consumed from `chars`.
fn collect_number(chars: &[char]) -> (String, usize) {
    let mut digits = String::new();
    let mut seen_point = false;
    let mut pos = 0;
    while let Some(&current) = chars.get(pos) {
        let digit_follows = chars.get(pos + 1).map_or(false, char::is_ascii_digit);
        match current {
            d if d.is_ascii_digit() => digits.push(d),
            // Grouping separator: consumed but not recorded.
            ',' if digit_follows => {}
            '.' if digit_follows && !seen_point => {
                digits.push('.');
                seen_point = true;
            }
            _ => break,
        }
        pos += 1;
    }
    (digits, pos)
}
/// Converts a numeric token (digits with an optional single `.`) to words.
///
/// Without a `.`, the token is read as a whole number; if it does not fit in
/// a `u64` it is read digit by digit. With a `.`, the part before it is read
/// the same way (or as "zero" when empty), followed by "point" and each
/// fractional digit read individually.
fn expand_number_token(token: &str) -> String {
    match token.split_once('.') {
        None => match token.parse::<u64>() {
            Ok(value) => number_to_words(value),
            Err(_) => digits_to_words(token),
        },
        Some((whole, fraction)) => {
            let mut spoken = if whole.is_empty() {
                String::from("zero")
            } else {
                match whole.parse::<u64>() {
                    Ok(value) => number_to_words(value),
                    Err(_) => digits_to_words(whole),
                }
            };
            spoken.push_str(" point");
            for digit in fraction.chars().filter_map(|c| c.to_digit(10)) {
                spoken.push(' ');
                spoken.push_str(ONES[digit as usize]);
            }
            spoken
        }
    }
}
/// Reads the decimal digits of `s` one at a time, separated by spaces.
///
/// Characters that are not decimal digits are skipped.
fn digits_to_words(s: &str) -> String {
    let names = s
        .chars()
        .filter_map(|c| c.to_digit(10))
        .map(|digit| ONES[digit as usize]);

    let mut spoken = String::new();
    for (pos, name) in names.enumerate() {
        if pos > 0 {
            spoken.push(' ');
        }
        spoken.push_str(name);
    }
    spoken
}
fn number_to_words(n: u64) -> String {
if n == 0 {
return String::from("zero");
}
let mut parts = alloc::vec::Vec::new();
if n >= 1_000_000_000 {
let billions = n / 1_000_000_000;
parts.push(alloc::format!("{} billion", number_to_words(billions)));
}
let remainder = n % 1_000_000_000;
if remainder >= 1_000_000 {
let millions = remainder / 1_000_000;
parts.push(alloc::format!("{} million", number_to_words(millions)));
}
let remainder = remainder % 1_000_000;
if remainder >= 1000 {
let thousands = remainder / 1000;
parts.push(alloc::format!("{} thousand", number_to_words(thousands)));
}
let remainder = remainder % 1000;
if remainder > 0 {
parts.push(hundreds_to_words(remainder));
}
parts.join(" ")
}
/// Spells out a value below one thousand, e.g. `342` becomes
/// "three hundred forty two".
///
/// Returns an empty string for zero. The hundreds digit is looked up in
/// `ONES`, so callers are expected to pass values below 1000.
fn hundreds_to_words(n: u64) -> String {
    match (n / 100, n % 100) {
        (0, 0) => String::new(),
        (0, rest) => tens_to_words(rest),
        (hundreds, 0) => alloc::format!("{} hundred", ONES[hundreds as usize]),
        (hundreds, rest) => alloc::format!(
            "{} hundred {}",
            ONES[hundreds as usize],
            tens_to_words(rest)
        ),
    }
}
/// Spells out a value below one hundred, e.g. `42` becomes "forty two".
///
/// Values below twenty come straight from `ONES`; larger values combine an
/// entry of `TENS` with an optional entry of `ONES`.
fn tens_to_words(n: u64) -> String {
    match (n / 10, n % 10) {
        (0 | 1, _) => String::from(ONES[n as usize]),
        (tens, 0) => String::from(TENS[tens as usize]),
        (tens, ones) => alloc::format!("{} {}", TENS[tens as usize], ONES[ones as usize]),
    }
}
/// English names of the numbers zero through nineteen, indexed by value.
static ONES: &[&str] = &[
    "zero",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
    "fifteen",
    "sixteen",
    "seventeen",
    "eighteen",
    "nineteen",
];
/// English names of the multiples of ten, indexed by the tens digit.
///
/// Entries 0 and 1 are empty placeholders: `tens_to_words` takes values below
/// twenty from `ONES` and never reads them.
static TENS: &[&str] = &[
    "", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
];
/// Classifies a sentence by its final character, ignoring trailing
/// whitespace.
///
/// A final `?` gives [`SentenceType::Question`], a final `!` gives
/// [`SentenceType::Exclamation`], and anything else (including empty text)
/// gives [`SentenceType::Statement`].
#[must_use]
pub fn detect_intonation(text: &str) -> SentenceType {
    match text.trim_end().chars().next_back() {
        Some('?') => SentenceType::Question,
        Some('!') => SentenceType::Exclamation,
        _ => SentenceType::Statement,
    }
}
/// Sentence category reported by `detect_intonation`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum SentenceType {
    /// Text that ends with neither `?` nor `!`.
    Statement,
    /// Text whose last non-whitespace character is `?`.
    Question,
    /// Text whose last non-whitespace character is `!`.
    Exclamation,
}
#[cfg(test)]
mod tests {
    use super::*;

    // --- normalize -------------------------------------------------------

    #[test]
    fn test_normalize_basic() {
        assert_eq!(normalize("Hello World!"), "hello world");
    }

    #[test]
    fn test_normalize_punctuation() {
        assert_eq!(normalize("it's a test."), "it's a test .pause");
    }

    #[test]
    fn test_normalize_multiple_spaces() {
        // The input must actually contain repeated and mixed whitespace for
        // this test to exercise whitespace collapsing.
        assert_eq!(normalize("too   many \t  spaces"), "too many spaces");
    }

    // --- detect_intonation -----------------------------------------------

    #[test]
    fn test_detect_intonation() {
        assert_eq!(detect_intonation("hello?"), SentenceType::Question);
        assert_eq!(detect_intonation("wow!"), SentenceType::Exclamation);
        assert_eq!(detect_intonation("ok."), SentenceType::Statement);
    }

    // --- expand_numbers --------------------------------------------------

    #[test]
    fn test_expand_zero() {
        assert_eq!(expand_numbers("0"), "zero");
    }

    #[test]
    fn test_expand_small() {
        assert_eq!(expand_numbers("42"), "forty two");
    }

    #[test]
    fn test_expand_hundred() {
        assert_eq!(expand_numbers("100"), "one hundred");
    }

    #[test]
    fn test_expand_thousand() {
        assert_eq!(expand_numbers("1000"), "one thousand");
    }

    #[test]
    fn test_expand_complex() {
        assert_eq!(
            expand_numbers("1234"),
            "one thousand two hundred thirty four"
        );
    }

    #[test]
    fn test_expand_with_comma() {
        assert_eq!(expand_numbers("1,000"), "one thousand");
    }

    #[test]
    fn test_expand_decimal() {
        assert_eq!(expand_numbers("3.14"), "three point one four");
    }

    #[test]
    fn test_expand_negative() {
        assert_eq!(expand_numbers("-5"), "negative five");
    }

    #[test]
    fn test_expand_in_sentence() {
        assert_eq!(expand_numbers("I have 42 cats"), "I have forty two cats");
    }

    #[test]
    fn test_expand_no_numbers() {
        assert_eq!(expand_numbers("no numbers here"), "no numbers here");
    }

    // --- normalize with numbers and pauses -------------------------------

    #[test]
    fn test_normalize_with_numbers() {
        assert_eq!(normalize("I have 42 cats!"), "i have forty two cats");
    }

    #[test]
    fn test_normalize_comma_pause() {
        let result = normalize("hello, world");
        assert!(result.contains(COMMA_PAUSE));
    }

    #[test]
    fn test_normalize_period_pause() {
        let result = normalize("first. second");
        assert!(result.contains(PERIOD_PAUSE));
    }

    // --- abbreviations ---------------------------------------------------

    #[test]
    fn test_expand_abbreviation_dr() {
        assert_eq!(expand_abbreviations("Dr. Smith"), "doctor Smith");
    }

    #[test]
    fn test_expand_abbreviation_mr() {
        assert_eq!(expand_abbreviations("Mr. Jones"), "mister Jones");
    }

    #[test]
    fn test_abbreviation_not_mid_word() {
        assert_eq!(expand_abbreviations("test."), "test.");
    }

    #[test]
    fn test_abbreviation_in_normalize() {
        let result = normalize("Dr. Smith is here");
        assert!(result.contains("doctor"), "Dr. should expand to doctor");
    }

    // --- acronyms --------------------------------------------------------

    #[test]
    fn test_acronym_spell_out() {
        assert_eq!(expand_acronyms("FBI"), "f b i");
    }

    #[test]
    fn test_acronym_pronounceable() {
        assert_eq!(expand_acronyms("NASA"), "nasa");
    }

    #[test]
    fn test_acronym_short_not_matched() {
        assert_eq!(expand_acronyms("I am OK"), "I am OK");
    }

    #[test]
    fn test_acronym_in_sentence() {
        assert_eq!(expand_acronyms("the FBI and NASA"), "the f b i and nasa");
    }

    // --- foreign words ---------------------------------------------------

    #[test]
    fn test_foreign_word_detection() {
        assert!(is_foreign_word("café"));
        assert!(is_foreign_word("naïve"));
        assert!(!is_foreign_word("hello"));
    }

    #[test]
    fn test_strip_diacritics() {
        assert_eq!(strip_diacritics("café"), "cafe");
        assert_eq!(strip_diacritics("naïve"), "naive");
        assert_eq!(strip_diacritics("résumé"), "resume");
    }
}