use std::collections::HashSet;
/// Stateless text-analysis helpers used to score document quality.
pub struct TextUtils;

impl TextUtils {
    /// Lowercases `text` and splits it on Unicode whitespace.
    ///
    /// NOTE: punctuation stays attached to tokens (`"test."` != `"test"`),
    /// so stopword lookups only match words with no trailing punctuation.
    /// This is intentional — the existing unit tests pin this behavior.
    pub fn tokenize(text: &str) -> Vec<String> {
        text.to_lowercase()
            .split_whitespace()
            .map(str::to_string)
            .collect()
    }

    /// Counts tokens of `text` that appear verbatim in `stopwords`.
    pub fn count_stopwords(text: &str, stopwords: &HashSet<String>) -> usize {
        Self::tokenize(text)
            .iter()
            .filter(|token| stopwords.contains(*token))
            .count()
    }

    /// Fraction of tokens that are stopwords, in `[0.0, 1.0]`.
    /// Returns `0.0` for input with no tokens (avoids division by zero).
    pub fn stopword_density(text: &str, stopwords: &HashSet<String>) -> f32 {
        let tokens = Self::tokenize(text);
        if tokens.is_empty() {
            return 0.0;
        }
        let stopword_count = tokens
            .iter()
            .filter(|token| stopwords.contains(*token))
            .count();
        stopword_count as f32 / tokens.len() as f32
    }

    /// Splits `text` into sentences on runs of `.`, `!`, or `?`,
    /// trimming each piece and dropping empty fragments.
    ///
    /// Implemented with a plain char-predicate split instead of the
    /// previous `regex` + `lazy_static` dependency: splitting on every
    /// terminator character and filtering out empty pieces is equivalent
    /// to splitting on the regex `[.!?]+` (consecutive terminators just
    /// produce empty fragments, which the filter removes).
    pub fn split_sentences(text: &str) -> Vec<String> {
        text.split(|c: char| matches!(c, '.' | '!' | '?'))
            .map(|s| s.trim().to_string())
            .filter(|s| !s.is_empty())
            .collect()
    }

    /// Heuristic quality score for `text` in `[0.0, 1.0]`.
    ///
    /// Component weights: stopword ratio (0.3), average sentence length
    /// (0.2), word count (0.2), vocabulary diversity (0.15), punctuation
    /// density (0.15). Texts under 50 bytes score `0.0` outright.
    pub fn calculate_text_quality(text: &str, stopwords: &HashSet<String>) -> f32 {
        // Reject very short inputs outright. `len()` is a byte length,
        // not a character count — close enough for this heuristic.
        if text.len() < 50 {
            return 0.0;
        }
        let mut score = 0.0;
        let tokens = Self::tokenize(text);
        if tokens.is_empty() {
            return 0.0;
        }
        // Reuse the token vector instead of calling `count_stopwords`,
        // which would re-tokenize the entire text a second time.
        let stopword_count = tokens
            .iter()
            .filter(|token| stopwords.contains(*token))
            .count();
        let stopword_ratio = stopword_count as f32 / tokens.len() as f32;
        // Natural English prose tends to sit in the 0.35–0.55 band;
        // outside it, credit decays linearly with distance from 0.45.
        if (0.35..=0.55).contains(&stopword_ratio) {
            score += 0.3;
        } else {
            score += 0.3 * (1.0 - (stopword_ratio - 0.45).abs() / 0.45).max(0.0);
        }
        let sentences = Self::split_sentences(text);
        if !sentences.is_empty() {
            // Average words per sentence; 12–28 gets full credit,
            // otherwise partial credit decaying away from 20.
            let avg_sentence_len = tokens.len() as f32 / sentences.len() as f32;
            if (12.0..=28.0).contains(&avg_sentence_len) {
                score += 0.2;
            } else {
                score += 0.2 * (1.0 - (avg_sentence_len - 20.0).abs() / 20.0).max(0.0);
            }
        }
        // Length band: 100–2000 words is ideal; >50 gets half credit.
        let word_count = tokens.len();
        if (100..=2000).contains(&word_count) {
            score += 0.2;
        } else if word_count > 50 {
            score += 0.1;
        }
        // Type/token ratio as a rough vocabulary-diversity signal.
        let unique_words: HashSet<_> = tokens.iter().collect();
        let diversity = unique_words.len() as f32 / tokens.len() as f32;
        if (0.5..=0.8).contains(&diversity) {
            score += 0.15;
        }
        // Punctuation marks per byte of text (chars vs. bytes mismatch is
        // tolerable here — this is a coarse heuristic band check).
        let punct_count = text.chars().filter(|c| ".,!?;:".contains(*c)).count();
        let punct_density = punct_count as f32 / text.len() as f32;
        if (0.02..=0.08).contains(&punct_density) {
            score += 0.15;
        }
        score.clamp(0.0, 1.0)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        // Tokens come back lowercased, with punctuation still attached.
        let tokens = TextUtils::tokenize("Hello World! This is a test.");
        let expected = vec!["hello", "world!", "this", "is", "a", "test."];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_quality_score() {
        // Small fixed stopword list for a deterministic score.
        let stopwords: HashSet<String> = ["the", "a", "is", "this", "with", "and"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let good_text = "This is a well-written article with proper structure and excellent content. \
It contains multiple sentences with appropriate punctuation and varied vocabulary. \
The writing demonstrates clear communication and informative presentation. \
Articles should have sufficient length and proper paragraph organization. \
Quality content requires thoughtful composition and careful editing.";
        let score = TextUtils::calculate_text_quality(good_text, &stopwords);
        println!("Quality score: {}", score);
        assert!(score > 0.24, "Expected score > 0.24, got {}", score);
    }
}