// content_extractor_rl/evaluation/ground_truth.rs

use crate::{Result, text_utils::TextUtils};
use serde::{Deserialize, Serialize};
use std::path::Path;
use std::collections::HashSet;
/// One ground-truth record for an article, deserialized from a JSON file.
///
/// Every field is `Option` because ground-truth files vary in which keys
/// they carry; the accessor methods on this type supply fallbacks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroundTruthData {
    /// Record kind (stored under the JSON key `"type"`).
    #[serde(rename = "type")]
    pub data_type: Option<String>,
    pub data_key: Option<String>,
    pub fetch_timestamp: Option<String>,
    pub session_id: Option<String>,
    pub mod_date: Option<String>,
    /// Reference article title.
    pub title: Option<String>,
    /// Reference article body text.
    pub text: Option<String>,
    /// Source URL (stored under the JSON key `"URL"`).
    #[serde(rename = "URL")]
    pub url: Option<String>,
    /// Publication date, snake_case JSON key; fallback for `get_pubdate`.
    pub pub_date: Option<String>,
    /// Publication date, camelCase JSON key; preferred by `get_pubdate`.
    #[serde(rename = "pubDate")]
    pub pubdate: Option<String>,
    pub author: Option<String>,
    /// Source names (JSON key `"sourceName"`); author fallback in `get_author`.
    #[serde(rename = "sourceName")]
    pub source_name: Option<Vec<String>>,
    pub language: Option<String>,
    pub keywords: Option<Vec<String>>,
    pub industries: Option<Vec<String>>,
    #[serde(rename = "uniqueID")]
    pub unique_id: Option<String>,
    pub module: Option<String>,
}
39
40impl GroundTruthData {
41 pub fn load(path: &Path) -> Result<Self> {
43 let json = std::fs::read_to_string(path)?;
44 let data: GroundTruthData = serde_json::from_str(&json)
45 .map_err(|e| crate::ExtractionError::ParseError(
46 format!("Failed to parse ground truth JSON: {}", e)
47 ))?;
48 Ok(data)
49 }
50
51 pub fn get_text(&self) -> &str {
53 self.text.as_deref().unwrap_or("")
54 }
55
56 pub fn get_title(&self) -> &str {
58 self.title.as_deref().unwrap_or("")
59 }
60
61 pub fn get_pubdate(&self) -> Option<&str> {
63 self.pubdate.as_deref().or(self.pub_date.as_deref())
64 }
65
66 pub fn get_url(&self) -> &str {
68 self.url.as_deref().unwrap_or("")
69 }
70
71 pub fn get_author(&self) -> Option<String> {
73 self.author.clone().or_else(|| {
74 self.source_name.as_ref().and_then(|names| {
75 if names.is_empty() {
76 None
77 } else {
78 Some(names.join(", "))
79 }
80 })
81 })
82 }
83}
84
/// Per-article metrics comparing an extraction against its ground truth.
///
/// The first ten fields are raw measurements; the last four are derived
/// aggregates filled in by `calculate_combined_quality`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationMetrics {
    /// Jaccard similarity of the normalized body-text word sets.
    pub text_jaccard_similarity: f32,
    /// |extracted ∩ truth| / |extracted| over body-text word sets.
    pub text_precision: f32,
    /// |extracted ∩ truth| / |truth| over body-text word sets.
    pub text_recall: f32,
    /// Harmonic mean of `text_precision` and `text_recall`.
    pub text_f1_score: f32,
    /// extracted length / ground-truth length (byte lengths).
    pub length_ratio: f32,
    /// Absolute byte-length difference between extraction and truth.
    pub length_difference: i32,
    /// min(sentence counts) / ground-truth sentence count.
    pub sentence_overlap: f32,
    /// min(paragraph counts) / ground-truth paragraph count.
    pub paragraph_overlap: f32,
    /// Jaccard similarity of the title word sets.
    pub title_jaccard_similarity: f32,
    /// F1 score over the title word sets.
    pub title_match_score: f32,
    /// Derived: weighted text-similarity aggregate.
    pub text_similarity_score: f32,
    /// Derived: weighted title-similarity aggregate.
    pub title_similarity_score: f32,
    /// Quality score supplied by the caller's existing pipeline.
    pub existing_quality_score: f32,
    /// Derived: final blended quality, clamped to [0, 1].
    pub combined_quality: f32,
}
112
113impl EvaluationMetrics {
114 pub fn calculate_combined_quality(&mut self, existing_quality: f32) {
117 self.text_similarity_score = self.text_jaccard_similarity * 0.4 +
119 self.text_f1_score * 0.4 +
120 self.sentence_overlap * 0.1 +
121 self.paragraph_overlap * 0.1;
122
123 self.title_similarity_score = self.title_jaccard_similarity * 0.5 +
125 self.title_match_score * 0.5;
126
127 self.existing_quality_score = existing_quality;
129
130 self.combined_quality = self.text_similarity_score * 0.40 +
132 self.title_similarity_score * 0.20 +
133 self.existing_quality_score * 0.40;
134
135 self.combined_quality = self.combined_quality.clamp(0.0, 1.0);
136 }
137}
138
/// Scores extracted article text/titles against `GroundTruthData` records.
pub struct GroundTruthEvaluator {
    // Words dropped during body-text tokenization; titles are NOT
    // stopword-filtered (see calculate_title_metrics).
    stopwords: HashSet<String>,
}
143
144impl GroundTruthEvaluator {
145 pub fn new(stopwords: HashSet<String>) -> Self {
147 Self { stopwords }
148 }
149
150 pub fn evaluate(
152 &self,
153 extracted_text: &str,
154 extracted_title: Option<&str>,
155 ground_truth: &GroundTruthData,
156 existing_quality: f32,
157 ) -> EvaluationMetrics {
158 let gt_text = ground_truth.get_text();
159 let gt_title = ground_truth.get_title();
160
161 let extracted_words = self.tokenize_and_normalize(extracted_text);
163 let gt_words = self.tokenize_and_normalize(gt_text);
164
165 let text_jaccard_similarity = self.calculate_jaccard_similarity(&extracted_words, >_words);
167
168 let text_precision = if extracted_words.is_empty() {
170 0.0
171 } else {
172 let intersection: HashSet<_> = extracted_words.intersection(>_words).collect();
173 intersection.len() as f32 / extracted_words.len() as f32
174 };
175
176 let text_recall = if gt_words.is_empty() {
177 0.0
178 } else {
179 let intersection: HashSet<_> = extracted_words.intersection(>_words).collect();
180 intersection.len() as f32 / gt_words.len() as f32
181 };
182
183 let text_f1_score = if text_precision + text_recall == 0.0 {
185 0.0
186 } else {
187 2.0 * text_precision * text_recall / (text_precision + text_recall)
188 };
189
190 let extracted_len = extracted_text.len();
192 let gt_len = gt_text.len();
193 let length_ratio = if gt_len == 0 {
194 0.0
195 } else {
196 extracted_len as f32 / gt_len as f32
197 };
198 let length_difference = (extracted_len as i32 - gt_len as i32).abs();
199
200 let extracted_sentences = TextUtils::split_sentences(extracted_text);
202 let gt_sentences = TextUtils::split_sentences(gt_text);
203 let sentence_overlap = if gt_sentences.is_empty() {
204 0.0
205 } else {
206 (extracted_sentences.len().min(gt_sentences.len()) as f32) /
207 (gt_sentences.len() as f32)
208 };
209
210 let extracted_paragraphs = extracted_text.split("\n\n").filter(|p| !p.trim().is_empty()).count();
211 let gt_paragraphs = gt_text.split("\n\n").filter(|p| !p.trim().is_empty()).count();
212 let paragraph_overlap = if gt_paragraphs == 0 {
213 0.0
214 } else {
215 (extracted_paragraphs.min(gt_paragraphs) as f32) / (gt_paragraphs as f32)
216 };
217
218 let (title_jaccard_similarity, title_match_score) = if let Some(ext_title) = extracted_title {
220 self.calculate_title_metrics(ext_title, gt_title)
221 } else {
222 (0.0, 0.0)
223 };
224
225 let mut metrics = EvaluationMetrics {
227 text_jaccard_similarity,
228 text_precision,
229 text_recall,
230 text_f1_score,
231 length_ratio,
232 length_difference,
233 sentence_overlap,
234 paragraph_overlap,
235 title_jaccard_similarity,
236 title_match_score,
237 text_similarity_score: 0.0,
238 title_similarity_score: 0.0,
239 existing_quality_score: existing_quality,
240 combined_quality: 0.0,
241 };
242
243 metrics.calculate_combined_quality(existing_quality);
244
245 metrics
246 }
247
248 fn calculate_jaccard_similarity(&self, set1: &HashSet<String>, set2: &HashSet<String>) -> f32 {
250 if set1.is_empty() && set2.is_empty() {
251 return 1.0;
252 }
253
254 let intersection: HashSet<_> = set1.intersection(set2).collect();
255 let union: HashSet<_> = set1.union(set2).collect();
256
257 if union.is_empty() {
258 0.0
259 } else {
260 intersection.len() as f32 / union.len() as f32
261 }
262 }
263
264 fn tokenize_and_normalize(&self, text: &str) -> HashSet<String> {
266 TextUtils::tokenize(text)
267 .into_iter()
268 .filter(|word| !self.stopwords.contains(word) && word.len() > 2)
269 .collect()
270 }
271
272 fn calculate_title_metrics(&self, extracted: &str, ground_truth: &str) -> (f32, f32) {
274 if ground_truth.is_empty() {
275 return (0.5, 0.5); }
277
278 if extracted.is_empty() {
279 return (0.0, 0.0);
280 }
281
282 let extracted_words: HashSet<_> = TextUtils::tokenize(extracted)
284 .into_iter()
285 .filter(|w| w.len() > 2)
286 .collect();
287 let gt_words: HashSet<_> = TextUtils::tokenize(ground_truth)
288 .into_iter()
289 .filter(|w| w.len() > 2)
290 .collect();
291
292 if gt_words.is_empty() {
293 return (0.5, 0.5);
294 }
295
296 let jaccard = self.calculate_jaccard_similarity(&extracted_words, >_words);
298
299 let intersection = extracted_words.intersection(>_words).count();
301 let recall = intersection as f32 / gt_words.len() as f32;
302 let precision = if extracted_words.is_empty() {
303 0.0
304 } else {
305 intersection as f32 / extracted_words.len() as f32
306 };
307
308 let f1_score = if recall + precision == 0.0 {
309 0.0
310 } else {
311 2.0 * recall * precision / (recall + precision)
312 };
313
314 (jaccard, f1_score)
315 }
316
317 pub fn evaluate_batch(
319 &self,
320 extractions: Vec<(String, Option<String>, &GroundTruthData, f32)>,
321 ) -> Vec<EvaluationMetrics> {
322 extractions
323 .into_iter()
324 .map(|(text, title, gt, quality)| {
325 self.evaluate(&text, title.as_deref(), gt, quality)
326 })
327 .collect()
328 }
329
330 pub fn average_metrics(metrics: &[EvaluationMetrics]) -> EvaluationMetrics {
332 if metrics.is_empty() {
333 return EvaluationMetrics {
334 text_jaccard_similarity: 0.0,
335 text_precision: 0.0,
336 text_recall: 0.0,
337 text_f1_score: 0.0,
338 length_ratio: 0.0,
339 length_difference: 0,
340 sentence_overlap: 0.0,
341 paragraph_overlap: 0.0,
342 title_jaccard_similarity: 0.0,
343 title_match_score: 0.0,
344 text_similarity_score: 0.0,
345 title_similarity_score: 0.0,
346 existing_quality_score: 0.0,
347 combined_quality: 0.0,
348 };
349 }
350
351 let n = metrics.len() as f32;
352
353 EvaluationMetrics {
354 text_jaccard_similarity: metrics.iter().map(|m| m.text_jaccard_similarity).sum::<f32>() / n,
355 text_precision: metrics.iter().map(|m| m.text_precision).sum::<f32>() / n,
356 text_recall: metrics.iter().map(|m| m.text_recall).sum::<f32>() / n,
357 text_f1_score: metrics.iter().map(|m| m.text_f1_score).sum::<f32>() / n,
358 length_ratio: metrics.iter().map(|m| m.length_ratio).sum::<f32>() / n,
359 length_difference: (metrics.iter().map(|m| m.length_difference).sum::<i32>() as f32 / n) as i32,
360 sentence_overlap: metrics.iter().map(|m| m.sentence_overlap).sum::<f32>() / n,
361 paragraph_overlap: metrics.iter().map(|m| m.paragraph_overlap).sum::<f32>() / n,
362 title_jaccard_similarity: metrics.iter().map(|m| m.title_jaccard_similarity).sum::<f32>() / n,
363 title_match_score: metrics.iter().map(|m| m.title_match_score).sum::<f32>() / n,
364 text_similarity_score: metrics.iter().map(|m| m.text_similarity_score).sum::<f32>() / n,
365 title_similarity_score: metrics.iter().map(|m| m.title_similarity_score).sum::<f32>() / n,
366 existing_quality_score: metrics.iter().map(|m| m.existing_quality_score).sum::<f32>() / n,
367 combined_quality: metrics.iter().map(|m| m.combined_quality).sum::<f32>() / n,
368 }
369 }
370}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end check: metrics come out non-trivial for overlapping text.
    #[test]
    fn test_evaluation() {
        let stopwords: HashSet<String> =
            ["the", "a", "is"].iter().map(|s| s.to_string()).collect();

        let evaluator = GroundTruthEvaluator::new(stopwords);

        let gt = GroundTruthData {
            data_type: Some("news".to_string()),
            data_key: None,
            fetch_timestamp: None,
            session_id: None,
            mod_date: None,
            title: Some("Test Article Title".to_string()),
            text: Some("This is the ground truth article text with several sentences. It contains important information.".to_string()),
            url: Some("https://example.com/article".to_string()),
            pub_date: None,
            pubdate: Some("2025-01-01".to_string()),
            author: None,
            source_name: None,
            language: Some("en".to_string()),
            keywords: None,
            industries: None,
            unique_id: None,
            module: None,
        };

        let extracted_text = "This is the extracted article text with several sentences.";
        let extracted_title = Some("Test Article");

        let metrics = evaluator.evaluate(extracted_text, extracted_title, &gt, 0.8);

        assert!(metrics.text_f1_score > 0.0);
        assert!(metrics.combined_quality > 0.0);
        assert!(metrics.title_match_score > 0.0);
        assert_eq!(metrics.existing_quality_score, 0.8);
    }

    /// 2-of-3 shared words should give a jaccard of ~2/3.
    #[test]
    fn test_jaccard_similarity() {
        let evaluator = GroundTruthEvaluator::new(HashSet::new());

        let to_set = |words: &[&str]| -> HashSet<String> {
            words.iter().map(|w| w.to_string()).collect()
        };
        let first = to_set(&["hello", "world"]);
        let second = to_set(&["hello", "world", "test"]);

        let similarity = evaluator.calculate_jaccard_similarity(&first, &second);
        assert!((similarity - 0.666).abs() < 0.01);
    }
}