use unicode_segmentation::UnicodeSegmentation;
use crate::domain::types::StyloProfile;
/// Replaces common typographic punctuation with its plain-ASCII equivalent.
///
/// Curly single/double quotes become `'` and `"`, en/em dashes become `--`,
/// the horizontal ellipsis becomes `...`, and a no-break space becomes a
/// regular space. All other characters pass through unchanged.
#[must_use]
pub fn normalize_punctuation(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for ch in text.chars() {
        let ascii: &str = match ch {
            // Single-quote family (left/right/low-9/reversed-9).
            '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => "'",
            // Double-quote family.
            '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => "\"",
            // En dash and em dash both map to a double hyphen.
            '\u{2013}' | '\u{2014}' => "--",
            '\u{2026}' => "...",
            // No-break space.
            '\u{00A0}' => " ",
            other => {
                out.push(other);
                continue;
            }
        };
        out.push_str(ascii);
    }
    out
}
/// Expands common English contractions ("can't" -> "cannot") while copying
/// everything between words (spaces, punctuation) through verbatim.
///
/// Lookup is case-insensitive and tolerates the typographic apostrophe
/// (U+2019) as well as the ASCII one, so text that has not been through
/// [`normalize_punctuation`] still expands. The original word's leading
/// capitalisation is preserved: "Don't" becomes "Do not", not "do not".
/// Unrecognised words are emitted unchanged.
#[must_use]
pub fn expand_contractions(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut last_end = 0;
    for (start, word) in text.unicode_word_indices() {
        // Copy the inter-word gap verbatim.
        result.push_str(&text[last_end..start]);
        last_end = start + word.len();
        // Fold case and the U+2019 apostrophe variant before table lookup;
        // UAX #29 keeps both apostrophe forms inside a single word.
        let key = word.to_lowercase().replace('\u{2019}', "'");
        let expansion = match key.as_str() {
            "don't" => Some("do not"),
            "doesn't" => Some("does not"),
            "didn't" => Some("did not"),
            "won't" => Some("will not"),
            "wouldn't" => Some("would not"),
            "couldn't" => Some("could not"),
            "shouldn't" => Some("should not"),
            "isn't" => Some("is not"),
            "aren't" => Some("are not"),
            "wasn't" => Some("was not"),
            "weren't" => Some("were not"),
            "haven't" => Some("have not"),
            "hasn't" => Some("has not"),
            "hadn't" => Some("had not"),
            "can't" => Some("cannot"),
            "it's" => Some("it is"),
            "i'm" => Some("I am"),
            "i've" => Some("I have"),
            "i'll" => Some("I will"),
            "i'd" => Some("I would"),
            "we're" => Some("we are"),
            "we've" => Some("we have"),
            "we'll" => Some("we will"),
            "they're" => Some("they are"),
            "they've" => Some("they have"),
            "they'll" => Some("they will"),
            "you're" => Some("you are"),
            "you've" => Some("you have"),
            "you'll" => Some("you will"),
            "he's" => Some("he is"),
            "she's" => Some("she is"),
            "that's" => Some("that is"),
            "there's" => Some("there is"),
            "here's" => Some("here is"),
            "what's" => Some("what is"),
            "who's" => Some("who is"),
            "let's" => Some("let us"),
            _ => None,
        };
        match expansion {
            Some(expanded) => push_matching_case(&mut result, word, expanded),
            None => result.push_str(word),
        }
    }
    // Trailing text after the final word (punctuation, whitespace).
    if last_end < text.len() {
        result.push_str(&text[last_end..]);
    }
    result
}

/// Appends `expanded` to `out`, upper-casing its first letter when the
/// original `word` started with an upper-case letter, so expansion does not
/// destroy sentence-initial capitalisation.
fn push_matching_case(out: &mut String, word: &str, expanded: &str) {
    let starts_upper = word.chars().next().is_some_and(char::is_uppercase);
    if starts_upper {
        let mut chars = expanded.chars();
        if let Some(first) = chars.next() {
            // `to_uppercase` may yield multiple chars for some scripts.
            out.extend(first.to_uppercase());
            out.push_str(chars.as_str());
        }
    } else {
        out.push_str(expanded);
    }
}
fn word_count(sentence: &str) -> usize {
sentence.unicode_words().count()
}
/// Runs the full scrubbing pipeline over `text`: optional punctuation
/// normalisation (per `profile.normalize_punctuation`), then contraction
/// expansion, then whitespace collapsing.
#[must_use]
pub fn scrub_text(text: &str, profile: &StyloProfile) -> String {
    // Empty input short-circuits the whole pipeline.
    if text.is_empty() {
        return String::new();
    }
    // Punctuation normalisation is opt-in; the other two passes always run.
    let normalized = if profile.normalize_punctuation {
        normalize_punctuation(text)
    } else {
        text.to_owned()
    };
    let expanded = expand_contractions(&normalized);
    collapse_whitespace(&expanded)
}
/// Collapses every run of Unicode whitespace to a single ASCII space and
/// trims leading/trailing whitespace.
fn collapse_whitespace(text: &str) -> String {
    // `split_whitespace` uses the same `char::is_whitespace` predicate,
    // skips empty runs, and ignores leading/trailing whitespace, so joining
    // with a single space reproduces the collapse-and-trim behaviour.
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}
#[must_use]
pub fn average_sentence_length(text: &str) -> f64 {
let sentences: Vec<&str> = text.split_sentence_bounds().collect();
let sentence_words: Vec<usize> = sentences
.iter()
.map(|s| word_count(s))
.filter(|&count| count > 0)
.collect();
if sentence_words.is_empty() {
return 0.0;
}
let total_words: usize = sentence_words.iter().sum();
#[expect(
clippy::cast_precision_loss,
reason = "sentence counts never exceed 2^52"
)]
{
total_words as f64 / sentence_words.len() as f64
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// All scrub tests share the same targets and only vary the
    /// punctuation-normalisation flag.
    fn profile(normalize_punctuation: bool) -> StyloProfile {
        StyloProfile {
            target_vocab_size: 5000,
            target_avg_sentence_len: 15.0,
            normalize_punctuation,
        }
    }

    #[test]
    fn punctuation_normalisation_smart_quotes() {
        let output =
            normalize_punctuation("\u{201C}Hello,\u{201D} she said. \u{2018}Goodbye.\u{2019}");
        assert_eq!(output, "\"Hello,\" she said. 'Goodbye.'");
    }

    #[test]
    fn punctuation_normalisation_em_dashes() {
        assert_eq!(normalize_punctuation("word\u{2014}another"), "word--another");
    }

    #[test]
    fn punctuation_normalisation_ellipsis() {
        assert_eq!(normalize_punctuation("wait\u{2026}"), "wait...");
    }

    #[test]
    fn contraction_expansion() {
        let output = expand_contractions("I can't believe they're here.");
        assert!(output.contains("cannot"));
        assert!(output.contains("they are"));
    }

    #[test]
    fn scrub_idempotent() {
        // Scrubbing already-scrubbed text must be a no-op.
        let profile = profile(true);
        let once = scrub_text("Don't worry\u{2014}it's fine!", &profile);
        let twice = scrub_text(&once, &profile);
        assert_eq!(once, twice);
    }

    #[test]
    fn non_latin_passes_through() {
        let arabic = "\u{0645}\u{0631}\u{062D}\u{0628}\u{0627} \u{0628}\u{0627}\u{0644}\u{0639}\u{0627}\u{0644}\u{0645}";
        assert!(!scrub_text(arabic, &profile(true)).is_empty());
    }

    #[test]
    fn chinese_passes_through() {
        let chinese = "\u{4F60}\u{597D}\u{4E16}\u{754C}";
        assert_eq!(scrub_text(chinese, &profile(false)), chinese);
    }

    #[test]
    fn whitespace_collapse() {
        assert_eq!(collapse_whitespace(" hello world "), "hello world");
    }

    #[test]
    fn average_sentence_length_basic() {
        assert!(average_sentence_length("Hello world. This is a test.") > 1.0);
    }

    #[test]
    fn empty_text_scrubs_to_empty() {
        assert!(scrub_text("", &profile(true)).is_empty());
    }
}