use std::collections::HashSet;
/// Namespace for stateless text-analysis helpers: tokenizing, stopword
/// statistics, sentence splitting, token-overlap F1 and a heuristic quality score.
pub struct TextUtils;

impl TextUtils {
    /// Lowercases `text` and splits it on whitespace.
    ///
    /// Punctuation is not stripped, so `"World!"` yields the token `"world!"`.
    pub fn tokenize(text: &str) -> Vec<String> {
        text.to_lowercase()
            .split_whitespace()
            .map(str::to_owned)
            .collect()
    }

    /// Returns how many tokens of `text` (see [`Self::tokenize`]) appear in `stopwords`.
    ///
    /// Tokens are lowercased before the lookup, so `stopwords` should hold
    /// lowercase entries to match.
    pub fn count_stopwords(text: &str, stopwords: &HashSet<String>) -> usize {
        Self::stopword_hits(&Self::tokenize(text), stopwords)
    }

    /// Returns the fraction of tokens in `text` that are stopwords, in `0.0..=1.0`.
    ///
    /// Returns `0.0` when `text` contains no tokens.
    pub fn stopword_density(text: &str, stopwords: &HashSet<String>) -> f32 {
        let tokens = Self::tokenize(text);
        if tokens.is_empty() {
            return 0.0;
        }
        Self::stopword_hits(&tokens, stopwords) as f32 / tokens.len() as f32
    }

    /// Splits `text` into sentences at `.`, `!` and `?`.
    ///
    /// Each piece is trimmed and empty pieces are dropped, so a run of
    /// terminators (`"?!"`, `"..."`) behaves like a single separator. The
    /// terminators themselves are not part of the returned sentences.
    pub fn split_sentences(text: &str) -> Vec<String> {
        text.split(|c: char| matches!(c, '.' | '!' | '?'))
            .map(str::trim)
            .filter(|sentence| !sentence.is_empty())
            .map(str::to_owned)
            .collect()
    }

    /// Computes the F1 overlap between the content-word sets of `extracted` and `reference`.
    ///
    /// Both texts are tokenized with [`Self::tokenize`], then reduced to the set of
    /// distinct tokens that are longer than two bytes (`String::len`) and are not in
    /// `stopwords`. Precision is measured against the `extracted` set, recall against
    /// the `reference` set.
    ///
    /// Returns `0.0` when either set is empty or when the sets do not overlap.
    pub fn token_f1(extracted: &str, reference: &str, stopwords: &HashSet<String>) -> f32 {
        let content_words = |t: &str| -> HashSet<String> {
            Self::tokenize(t)
                .into_iter()
                .filter(|w| w.len() > 2 && !stopwords.contains(w))
                .collect()
        };
        let ext = content_words(extracted);
        let ref_set = content_words(reference);
        if ref_set.is_empty() || ext.is_empty() {
            return 0.0;
        }

        let intersection = ext.intersection(&ref_set).count() as f32;
        let precision = intersection / ext.len() as f32;
        let recall = intersection / ref_set.len() as f32;
        // No overlap at all: avoid the 0/0 division below.
        if precision + recall == 0.0 {
            0.0
        } else {
            2.0 * precision * recall / (precision + recall)
        }
    }

    /// Heuristic quality score for `text`, in `0.0..=1.0`.
    ///
    /// Returns `0.0` for texts shorter than 50 bytes or without any token.
    /// Otherwise the score is the sum of five components:
    ///
    /// | component                         | full credit when        | weight |
    /// |-----------------------------------|-------------------------|--------|
    /// | stopword ratio                    | within `0.35..=0.55`    | 0.30   |
    /// | average tokens per sentence       | within `12.0..=28.0`    | 0.20   |
    /// | token count                       | within `100..=2000`     | 0.20   |
    /// | distinct tokens / tokens          | within `0.5..=0.8`      | 0.15   |
    /// | punctuation chars / total chars   | within `0.02..=0.08`    | 0.15   |
    ///
    /// The first two components earn partial credit outside their band (see
    /// `banded_score`); a token count above 50 but outside the band earns 0.1;
    /// the last two are all-or-nothing. Punctuation means any of `.,!?;:`.
    pub fn calculate_text_quality(text: &str, stopwords: &HashSet<String>) -> f32 {
        // Cheap length gate; this is a byte length (`str::len`), kept as-is
        // for backward compatibility.
        if text.len() < 50 {
            return 0.0;
        }
        let tokens = Self::tokenize(text);
        if tokens.is_empty() {
            return 0.0;
        }
        let token_count = tokens.len();
        let token_total = token_count as f32;

        // Reuse the tokens computed above instead of tokenizing the text again.
        let stopword_ratio = Self::stopword_hits(&tokens, stopwords) as f32 / token_total;
        let mut score = Self::banded_score(stopword_ratio, 0.35..=0.55, 0.45, 0.3);

        // A text made only of terminators has tokens but no sentences.
        let sentence_count = Self::split_sentences(text).len();
        if sentence_count > 0 {
            let avg_sentence_len = token_total / sentence_count as f32;
            score += Self::banded_score(avg_sentence_len, 12.0..=28.0, 20.0, 0.2);
        }

        score += match token_count {
            100..=2000 => 0.2,
            n if n > 50 => 0.1,
            _ => 0.0,
        };

        let unique_count = tokens.iter().collect::<HashSet<_>>().len();
        let diversity = unique_count as f32 / token_total;
        if (0.5..=0.8).contains(&diversity) {
            score += 0.15;
        }

        // Count punctuation and total characters in the same unit (chars).
        // Dividing a char count by the byte length would under-report the
        // density of any text containing multi-byte characters.
        let (char_count, punct_count) =
            text.chars().fold((0usize, 0usize), |(chars, punct), c| {
                (chars + 1, punct + usize::from(".,!?;:".contains(c)))
            });
        let punct_density = punct_count as f32 / char_count as f32;
        if (0.02..=0.08).contains(&punct_density) {
            score += 0.15;
        }

        score.clamp(0.0, 1.0)
    }

    /// Counts the entries of `tokens` that are present in `stopwords`.
    fn stopword_hits(tokens: &[String], stopwords: &HashSet<String>) -> usize {
        tokens
            .iter()
            .filter(|token| stopwords.contains(*token))
            .count()
    }

    /// Returns `weight` when `value` lies inside `band`; otherwise `weight`
    /// scaled down linearly with the distance of `value` from `center`,
    /// reaching zero once that distance is at least `center`.
    fn banded_score(
        value: f32,
        band: std::ops::RangeInclusive<f32>,
        center: f32,
        weight: f32,
    ) -> f32 {
        if band.contains(&value) {
            weight
        } else {
            weight * (1.0 - (value - center).abs() / center).max(0.0)
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the small lowercase stopword set shared by the tests below.
    fn sample_stopwords() -> HashSet<String> {
        ["the", "a", "is", "this", "with", "and"]
            .iter()
            .map(|word| word.to_string())
            .collect()
    }

    /// Tokenizing lowercases the input and leaves punctuation attached to words.
    #[test]
    fn test_tokenize() {
        let tokens = TextUtils::tokenize("Hello World! This is a test.");
        assert_eq!(tokens, vec!["hello", "world!", "this", "is", "a", "test."]);
    }

    /// F1 is 1.0 for identical texts, lower for partial overlap, lowest for
    /// unrelated text, and 0.0 when the reference is empty.
    #[test]
    fn test_token_f1() {
        let stopwords = sample_stopwords();
        let reference = "quantum computing breakthrough announced researchers laboratory";

        let f1_perfect = TextUtils::token_f1(reference, reference, &stopwords);
        let f1_partial = TextUtils::token_f1(
            "quantum computing breakthrough unrelated padding",
            reference,
            &stopwords,
        );
        let f1_unrelated = TextUtils::token_f1(
            "weather forecast sunny tomorrow afternoon",
            reference,
            &stopwords,
        );

        assert!((f1_perfect - 1.0).abs() < 1e-6, "perfect match should be 1.0, got {f1_perfect}");
        assert!(f1_partial > 0.0 && f1_partial < f1_perfect);
        assert!(f1_unrelated < f1_partial);
        assert_eq!(TextUtils::token_f1("anything", "", &stopwords), 0.0);
    }

    /// A multi-sentence, well-formed paragraph clears a minimum quality threshold.
    #[test]
    fn test_quality_score() {
        let stopwords = sample_stopwords();
        let good_text = "This is a well-written article with proper structure and excellent content. \
            It contains multiple sentences with appropriate punctuation and varied vocabulary. \
            The writing demonstrates clear communication and informative presentation. \
            Articles should have sufficient length and proper paragraph organization. \
            Quality content requires thoughtful composition and careful editing.";

        let score = TextUtils::calculate_text_quality(good_text, &stopwords);
        println!("Quality score: {}", score);
        assert!(score > 0.24, "Expected score > 0.24, got {}", score);
    }
}