content_extractor_rl/
text_utils.rs1use std::collections::HashSet;
7
/// Namespace for stateless text-analysis helpers.
///
/// All functionality is exposed as associated functions; the type holds no data.
pub struct TextUtils;

impl TextUtils {
    /// Lowercases `text` and splits it on whitespace.
    ///
    /// Punctuation is *not* stripped, so `"World!"` yields the token `"world!"`.
    pub fn tokenize(text: &str) -> Vec<String> {
        text.to_lowercase()
            .split_whitespace()
            .map(str::to_owned)
            .collect()
    }

    /// Counts the entries of an already-tokenized text that are members of `stopwords`.
    fn count_stopword_tokens(tokens: &[String], stopwords: &HashSet<String>) -> usize {
        tokens
            .iter()
            .filter(|token| stopwords.contains(token.as_str()))
            .count()
    }

    /// Returns the number of tokens of `text` (see [`TextUtils::tokenize`]) found in `stopwords`.
    ///
    /// Matching is exact on the lowercased token, so a stopword followed by
    /// punctuation (e.g. `"the,"`) is not counted.
    pub fn count_stopwords(text: &str, stopwords: &HashSet<String>) -> usize {
        Self::count_stopword_tokens(&Self::tokenize(text), stopwords)
    }

    /// Returns the fraction of tokens of `text` that are stopwords, in `0.0..=1.0`.
    ///
    /// Returns `0.0` when `text` contains no tokens.
    pub fn stopword_density(text: &str, stopwords: &HashSet<String>) -> f32 {
        let tokens = Self::tokenize(text);
        if tokens.is_empty() {
            return 0.0;
        }

        Self::count_stopword_tokens(&tokens, stopwords) as f32 / tokens.len() as f32
    }

    /// Splits `text` into sentences at runs of `.`, `!` or `?`.
    ///
    /// Each piece is trimmed, and pieces that are empty after trimming are dropped,
    /// so consecutive terminators (`"?!"`, `"..."`) never produce empty sentences.
    pub fn split_sentences(text: &str) -> Vec<String> {
        // Splitting on every single terminator and discarding the empty pieces is
        // equivalent to splitting on runs of terminators.
        text.split(|c: char| matches!(c, '.' | '!' | '?'))
            .map(str::trim)
            .filter(|sentence| !sentence.is_empty())
            .map(str::to_owned)
            .collect()
    }

    /// Computes the F1 score between the token *sets* of `extracted` and `reference`.
    ///
    /// Both texts are tokenized with [`TextUtils::tokenize`]; tokens of two bytes
    /// or fewer and tokens present in `stopwords` are discarded, and duplicates
    /// collapse because the comparison is set-based.
    ///
    /// Returns `0.0` when either filtered set is empty or when the sets share no token.
    pub fn token_f1(extracted: &str, reference: &str, stopwords: &HashSet<String>) -> f32 {
        let norm = |t: &str| -> HashSet<String> {
            Self::tokenize(t)
                .into_iter()
                // `len()` is the UTF-8 byte length of the token.
                .filter(|w| w.len() > 2 && !stopwords.contains(w))
                .collect()
        };

        let ext = norm(extracted);
        let ref_set = norm(reference);

        if ref_set.is_empty() || ext.is_empty() {
            return 0.0;
        }

        let intersection = ext.intersection(&ref_set).count() as f32;
        let precision = intersection / ext.len() as f32;
        let recall = intersection / ref_set.len() as f32;

        if precision + recall == 0.0 {
            0.0
        } else {
            2.0 * precision * recall / (precision + recall)
        }
    }

    /// Scores `text` with a heuristic quality measure in `0.0..=1.0`.
    ///
    /// Texts shorter than 50 bytes, or without any token, score `0.0`. Otherwise
    /// the score is the sum of five components:
    ///
    /// * up to `0.30` — stopword ratio; full marks inside `0.35..=0.55`, otherwise
    ///   decaying linearly with the distance from `0.45`;
    /// * up to `0.20` — average tokens per sentence; full marks inside
    ///   `12.0..=28.0`, otherwise decaying linearly with the distance from `20.0`;
    /// * `0.20` for `100..=2000` tokens, else `0.10` for more than 50 tokens;
    /// * `0.15` when the unique-token ratio lies in `0.5..=0.8`;
    /// * `0.15` when the punctuation density (punctuation characters per
    ///   character of `text`) lies in `0.02..=0.08`.
    pub fn calculate_text_quality(text: &str, stopwords: &HashSet<String>) -> f32 {
        // Length gate is expressed in bytes.
        if text.len() < 50 {
            return 0.0;
        }

        let tokens = Self::tokenize(text);
        if tokens.is_empty() {
            return 0.0;
        }

        let token_total = tokens.len() as f32;
        let mut score = 0.0_f32;

        // Reuse the token list instead of tokenizing the text a second time.
        let stopword_ratio = Self::count_stopword_tokens(&tokens, stopwords) as f32 / token_total;
        if (0.35..=0.55).contains(&stopword_ratio) {
            score += 0.3;
        } else {
            score += 0.3 * (1.0 - (stopword_ratio - 0.45).abs() / 0.45).max(0.0);
        }

        let sentences = Self::split_sentences(text);
        if !sentences.is_empty() {
            let avg_sentence_len = token_total / sentences.len() as f32;
            if (12.0..=28.0).contains(&avg_sentence_len) {
                score += 0.2;
            } else {
                score += 0.2 * (1.0 - (avg_sentence_len - 20.0).abs() / 20.0).max(0.0);
            }
        }

        let word_count = tokens.len();
        if (100..=2000).contains(&word_count) {
            score += 0.2;
        } else if word_count > 50 {
            score += 0.1;
        }

        let unique_words: HashSet<_> = tokens.iter().collect();
        let diversity = unique_words.len() as f32 / token_total;
        if (0.5..=0.8).contains(&diversity) {
            score += 0.15;
        }

        // Numerator and denominator must use the same unit: both are counted in
        // characters, so multi-byte text is not penalized for its byte length.
        // `char_total` is non-zero because the text passed the length gate.
        let char_total = text.chars().count();
        let punct_count = text.chars().filter(|c| ".,!?;:".contains(*c)).count();
        let punct_density = punct_count as f32 / char_total as f32;
        if (0.02..=0.08).contains(&punct_density) {
            score += 0.15;
        }

        score.clamp(0.0, 1.0)
    }
}
146
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the small stopword fixture shared by the scoring tests.
    fn sample_stopwords() -> HashSet<String> {
        ["the", "a", "is", "this", "with", "and"]
            .iter()
            .map(|word| word.to_string())
            .collect()
    }

    /// Tokenization lowercases and splits on whitespace but keeps punctuation attached.
    #[test]
    fn test_tokenize() {
        let expected = vec!["hello", "world!", "this", "is", "a", "test."];
        let actual = TextUtils::tokenize("Hello World! This is a test.");
        assert_eq!(actual, expected);
    }

    /// F1 is 1.0 for identical texts, ordered perfect > partial > unrelated,
    /// and 0.0 against an empty reference.
    #[test]
    fn test_token_f1() {
        let stopwords = sample_stopwords();
        let reference = "quantum computing breakthrough announced researchers laboratory";
        let score_for = |candidate: &str| TextUtils::token_f1(candidate, reference, &stopwords);

        let f1_perfect = score_for(reference);
        let f1_partial = score_for("quantum computing breakthrough unrelated padding");
        let f1_unrelated = score_for("weather forecast sunny tomorrow afternoon");

        assert!((f1_perfect - 1.0).abs() < 1e-6, "perfect match should be 1.0, got {f1_perfect}");
        assert!(f1_partial > 0.0 && f1_partial < f1_perfect);
        assert!(f1_unrelated < f1_partial);
        assert_eq!(TextUtils::token_f1("anything", "", &stopwords), 0.0);
    }

    /// A multi-sentence, well-formed paragraph clears the minimum quality threshold.
    #[test]
    fn test_quality_score() {
        let stopwords = sample_stopwords();
        let good_text = "This is a well-written article with proper structure and excellent content. \
                         It contains multiple sentences with appropriate punctuation and varied vocabulary. \
                         The writing demonstrates clear communication and informative presentation. \
                         Articles should have sufficient length and proper paragraph organization. \
                         Quality content requires thoughtful composition and careful editing.";

        let score = TextUtils::calculate_text_quality(good_text, &stopwords);
        println!("Quality score: {}", score);
        assert!(score > 0.24, "Expected score > 0.24, got {}", score);
    }
}