// content_extractor_rl/text_utils.rs
use std::collections::HashSet;
7
/// Stateless helpers for tokenizing text and scoring its quality.
pub struct TextUtils;

impl TextUtils {
    /// Lowercases `text` and splits it on whitespace.
    ///
    /// Punctuation is NOT stripped, so `"test."` stays one token —
    /// the stopword checks below rely on this exact tokenization.
    pub fn tokenize(text: &str) -> Vec<String> {
        text.to_lowercase()
            .split_whitespace()
            .map(|s| s.to_string())
            .collect()
    }

    /// Counts how many tokens of `text` appear in `stopwords`.
    pub fn count_stopwords(text: &str, stopwords: &HashSet<String>) -> usize {
        Self::tokenize(text)
            .iter()
            .filter(|token| stopwords.contains(*token))
            .count()
    }

    /// Fraction of tokens that are stopwords; returns 0.0 for empty input.
    pub fn stopword_density(text: &str, stopwords: &HashSet<String>) -> f32 {
        let total = Self::tokenize(text).len();
        if total == 0 {
            return 0.0;
        }
        // Reuse count_stopwords rather than duplicating its filter logic.
        Self::count_stopwords(text, stopwords) as f32 / total as f32
    }

    /// Splits `text` into sentence fragments at runs of '.', '!' or '?'.
    ///
    /// Fragments are trimmed and empty ones dropped. A plain char-predicate
    /// split is used instead of the old regex `[.!?]+`: splitting on every
    /// terminator char and discarding empty pieces produces exactly the same
    /// output as splitting on runs, without needing regex/lazy_static here.
    pub fn split_sentences(text: &str) -> Vec<String> {
        text.split(|c: char| matches!(c, '.' | '!' | '?'))
            .map(|s| s.trim().to_string())
            .filter(|s| !s.is_empty())
            .collect()
    }

    /// Heuristic quality score for `text`, clamped to [0.0, 1.0].
    ///
    /// Combines five weighted signals:
    /// * stopword ratio near natural prose (~0.45)   -> up to 0.30
    /// * average sentence length of 12-28 tokens     -> up to 0.20
    /// * word count (100-2000 ideal, >50 partial)    -> up to 0.20
    /// * vocabulary diversity (unique/total 0.5-0.8) -> 0.15
    /// * punctuation density (0.02-0.08 per char)    -> 0.15
    ///
    /// Texts shorter than 50 bytes, or with no tokens, score 0.0.
    pub fn calculate_text_quality(text: &str, stopwords: &HashSet<String>) -> f32 {
        // Cheap early-out: anything under 50 bytes can't be an article.
        if text.len() < 50 {
            return 0.0;
        }

        let tokens = Self::tokenize(text);
        if tokens.is_empty() {
            return 0.0;
        }

        let mut score = 0.0;

        // Stopword ratio: full credit inside [0.35, 0.55], otherwise a
        // linear falloff centred on 0.45.
        let stopword_ratio = Self::count_stopwords(text, stopwords) as f32 / tokens.len() as f32;
        if (0.35..=0.55).contains(&stopword_ratio) {
            score += 0.3;
        } else {
            score += 0.3 * (1.0 - (stopword_ratio - 0.45).abs() / 0.45).max(0.0);
        }

        // Average sentence length: full credit for 12-28 tokens/sentence,
        // linear falloff centred on 20 otherwise.
        let sentences = Self::split_sentences(text);
        if !sentences.is_empty() {
            let avg_sentence_len = tokens.len() as f32 / sentences.len() as f32;
            if (12.0..=28.0).contains(&avg_sentence_len) {
                score += 0.2;
            } else {
                score += 0.2 * (1.0 - (avg_sentence_len - 20.0).abs() / 20.0).max(0.0);
            }
        }

        // Word count: article-length texts score best.
        let word_count = tokens.len();
        if (100..=2000).contains(&word_count) {
            score += 0.2;
        } else if word_count > 50 {
            score += 0.1;
        }

        // Lexical diversity: unique tokens / total tokens.
        let unique_words: HashSet<_> = tokens.iter().collect();
        let diversity = unique_words.len() as f32 / tokens.len() as f32;
        if (0.5..=0.8).contains(&diversity) {
            score += 0.15;
        }

        // Punctuation density. BUG FIX: the numerator counts chars, so the
        // denominator must too — the original divided by text.len() (bytes),
        // which skewed the ratio low for non-ASCII text. char_count is
        // nonzero here because text.len() >= 50.
        let punct_count = text.chars().filter(|c| ".,!?;:".contains(*c)).count();
        let char_count = text.chars().count();
        let punct_density = punct_count as f32 / char_count as f32;
        if (0.02..=0.08).contains(&punct_density) {
            score += 0.15;
        }

        score.clamp(0.0, 1.0)
    }
}
113
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize() {
        // Tokens are lowercased but keep their trailing punctuation.
        let tokens = TextUtils::tokenize("Hello World! This is a test.");
        assert_eq!(tokens, vec!["hello", "world!", "this", "is", "a", "test."]);
    }

    #[test]
    fn test_quality_score() {
        let stopwords: HashSet<String> = ["the", "a", "is", "this", "with", "and"]
            .iter()
            .map(|word| word.to_string())
            .collect();

        let good_text = "This is a well-written article with proper structure and excellent content. \
            It contains multiple sentences with appropriate punctuation and varied vocabulary. \
            The writing demonstrates clear communication and informative presentation. \
            Articles should have sufficient length and proper paragraph organization. \
            Quality content requires thoughtful composition and careful editing.";

        let quality = TextUtils::calculate_text_quality(good_text, &stopwords);
        println!("Quality score: {}", quality);
        assert!(quality > 0.24, "Expected score > 0.24, got {}", quality);
    }
}