//! Enhanced evaluation against ground truth data from pre-extracted JSON files
// ============================================================================
// FILE: crates/content-extractor-rl/src/ground_truth.rs
// (module path: content_extractor_rl/evaluation/ground_truth.rs)
// ============================================================================

use crate::{Result, text_utils::TextUtils};
use serde::{Deserialize, Serialize};
use std::path::Path;
use std::collections::HashSet;
// use tracing::{info, warn};

/// Ground truth data from pre-extracted JSON
///
/// Every field is optional so partially populated JSON records still
/// deserialize; the accessor methods supply empty-string / `None` defaults.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GroundTruthData {
    /// Record type (JSON key "type"), e.g. "news"
    #[serde(rename = "type")]
    pub data_type: Option<String>,
    pub data_key: Option<String>,
    pub fetch_timestamp: Option<String>,
    pub session_id: Option<String>,
    pub mod_date: Option<String>,
    /// Reference title compared against the extracted title
    pub title: Option<String>,
    /// Reference body text; the primary evaluation target
    pub text: Option<String>,
    #[serde(rename = "URL")]
    pub url: Option<String>,
    /// Publication date, snake_case JSON spelling (see also `pubdate`)
    pub pub_date: Option<String>,
    /// Publication date, camelCase JSON spelling; preferred by `get_pubdate`
    #[serde(rename = "pubDate")]
    pub pubdate: Option<String>,
    pub author: Option<String>,
    /// Fallback author source when `author` is absent (joined with ", ")
    #[serde(rename = "sourceName")]
    pub source_name: Option<Vec<String>>,
    pub language: Option<String>,
    pub keywords: Option<Vec<String>>,
    pub industries: Option<Vec<String>>,
    #[serde(rename = "uniqueID")]
    pub unique_id: Option<String>,
    pub module: Option<String>,
}
39
40impl GroundTruthData {
41    /// Load from JSON file
42    pub fn load(path: &Path) -> Result<Self> {
43        let json = std::fs::read_to_string(path)?;
44        let data: GroundTruthData = serde_json::from_str(&json)
45            .map_err(|e| crate::ExtractionError::ParseError(
46                format!("Failed to parse ground truth JSON: {}", e)
47            ))?;
48        Ok(data)
49    }
50
51    /// Get the ground truth text
52    pub fn get_text(&self) -> &str {
53        self.text.as_deref().unwrap_or("")
54    }
55
56    /// Get the ground truth title
57    pub fn get_title(&self) -> &str {
58        self.title.as_deref().unwrap_or("")
59    }
60
61    /// Get the publication date (handles both variants)
62    pub fn get_pubdate(&self) -> Option<&str> {
63        self.pubdate.as_deref().or(self.pub_date.as_deref())
64    }
65
66    /// Get the URL
67    pub fn get_url(&self) -> &str {
68        self.url.as_deref().unwrap_or("")
69    }
70
71    /// Get author name(s)
72    pub fn get_author(&self) -> Option<String> {
73        self.author.clone().or_else(|| {
74            self.source_name.as_ref().and_then(|names| {
75                if names.is_empty() {
76                    None
77                } else {
78                    Some(names.join(", "))
79                }
80            })
81        })
82    }
83}
84
/// Evaluation metrics comparing extracted vs ground truth
///
/// Produced by `GroundTruthEvaluator::evaluate`; the four aggregate `*_score`
/// fields are filled in afterwards by `calculate_combined_quality`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationMetrics {
    // Text similarity metrics
    pub text_jaccard_similarity: f32,  // Jaccard similarity of words
    pub text_precision: f32,            // % of extracted words in ground truth
    pub text_recall: f32,               // % of ground truth words in extracted
    pub text_f1_score: f32,             // Harmonic mean of precision/recall

    // Length-based metrics
    pub length_ratio: f32,              // extracted_len / ground_truth_len (0 when no ground truth)
    pub length_difference: i32,         // Absolute difference in characters

    // Semantic metrics
    pub sentence_overlap: f32,          // Overlap in sentence count (capped count ratio)
    pub paragraph_overlap: f32,         // Overlap in paragraph structure (capped count ratio)

    // Title matching
    pub title_jaccard_similarity: f32,  // Jaccard similarity for title
    pub title_match_score: f32,         // Overall title match (0-1)

    // Combined quality metrics
    pub text_similarity_score: f32,     // Weighted text similarity (40%)
    pub title_similarity_score: f32,    // Weighted title similarity (20%)
    pub existing_quality_score: f32,    // Existing quality metrics (40%)
    pub combined_quality: f32,          // Final weighted combination, clamped to [0, 1]
}
112
113impl EvaluationMetrics {
114    /// Calculate combined quality score with proper weighting
115    /// Text similarity: 40%, Title match: 20%, Existing quality: 40%
116    pub fn calculate_combined_quality(&mut self, existing_quality: f32) {
117        // Text similarity component (40%)
118        self.text_similarity_score = self.text_jaccard_similarity * 0.4 +
119                self.text_f1_score * 0.4 +
120                self.sentence_overlap * 0.1 +
121                self.paragraph_overlap * 0.1;
122
123        // Title similarity component (20%)
124        self.title_similarity_score = self.title_jaccard_similarity * 0.5 +
125                self.title_match_score * 0.5;
126
127        // Existing quality component (40%)
128        self.existing_quality_score = existing_quality;
129
130        // Final weighted combination
131        self.combined_quality = self.text_similarity_score * 0.40 +
132                self.title_similarity_score * 0.20 +
133                self.existing_quality_score * 0.40;
134
135        self.combined_quality = self.combined_quality.clamp(0.0, 1.0);
136    }
137}
138
/// Evaluator for comparing extracted text against ground truth
pub struct GroundTruthEvaluator {
    /// Words removed during tokenization before similarity comparisons
    stopwords: HashSet<String>,
}
143
144impl GroundTruthEvaluator {
145    /// Create new evaluator
146    pub fn new(stopwords: HashSet<String>) -> Self {
147        Self { stopwords }
148    }
149
150    /// Evaluate extracted text against ground truth
151    pub fn evaluate(
152        &self,
153        extracted_text: &str,
154        extracted_title: Option<&str>,
155        ground_truth: &GroundTruthData,
156        existing_quality: f32,
157    ) -> EvaluationMetrics {
158        let gt_text = ground_truth.get_text();
159        let gt_title = ground_truth.get_title();
160
161        // Tokenize texts
162        let extracted_words = self.tokenize_and_normalize(extracted_text);
163        let gt_words = self.tokenize_and_normalize(gt_text);
164
165        // Calculate text Jaccard similarity
166        let text_jaccard_similarity = self.calculate_jaccard_similarity(&extracted_words, &gt_words);
167
168        // Calculate precision and recall
169        let text_precision = if extracted_words.is_empty() {
170            0.0
171        } else {
172            let intersection: HashSet<_> = extracted_words.intersection(&gt_words).collect();
173            intersection.len() as f32 / extracted_words.len() as f32
174        };
175
176        let text_recall = if gt_words.is_empty() {
177            0.0
178        } else {
179            let intersection: HashSet<_> = extracted_words.intersection(&gt_words).collect();
180            intersection.len() as f32 / gt_words.len() as f32
181        };
182
183        // Calculate F1 score
184        let text_f1_score = if text_precision + text_recall == 0.0 {
185            0.0
186        } else {
187            2.0 * text_precision * text_recall / (text_precision + text_recall)
188        };
189
190        // Length metrics
191        let extracted_len = extracted_text.len();
192        let gt_len = gt_text.len();
193        let length_ratio = if gt_len == 0 {
194            0.0
195        } else {
196            extracted_len as f32 / gt_len as f32
197        };
198        let length_difference = (extracted_len as i32 - gt_len as i32).abs();
199
200        // Sentence and paragraph overlap
201        let extracted_sentences = TextUtils::split_sentences(extracted_text);
202        let gt_sentences = TextUtils::split_sentences(gt_text);
203        let sentence_overlap = if gt_sentences.is_empty() {
204            0.0
205        } else {
206            (extracted_sentences.len().min(gt_sentences.len()) as f32) /
207                (gt_sentences.len() as f32)
208        };
209
210        let extracted_paragraphs = extracted_text.split("\n\n").filter(|p| !p.trim().is_empty()).count();
211        let gt_paragraphs = gt_text.split("\n\n").filter(|p| !p.trim().is_empty()).count();
212        let paragraph_overlap = if gt_paragraphs == 0 {
213            0.0
214        } else {
215            (extracted_paragraphs.min(gt_paragraphs) as f32) / (gt_paragraphs as f32)
216        };
217
218        // Title matching
219        let (title_jaccard_similarity, title_match_score) = if let Some(ext_title) = extracted_title {
220            self.calculate_title_metrics(ext_title, gt_title)
221        } else {
222            (0.0, 0.0)
223        };
224
225        // Create metrics
226        let mut metrics = EvaluationMetrics {
227            text_jaccard_similarity,
228            text_precision,
229            text_recall,
230            text_f1_score,
231            length_ratio,
232            length_difference,
233            sentence_overlap,
234            paragraph_overlap,
235            title_jaccard_similarity,
236            title_match_score,
237            text_similarity_score: 0.0,
238            title_similarity_score: 0.0,
239            existing_quality_score: existing_quality,
240            combined_quality: 0.0,
241        };
242
243        metrics.calculate_combined_quality(existing_quality);
244
245        metrics
246    }
247
248    /// Calculate Jaccard similarity between two sets
249    fn calculate_jaccard_similarity(&self, set1: &HashSet<String>, set2: &HashSet<String>) -> f32 {
250        if set1.is_empty() && set2.is_empty() {
251            return 1.0;
252        }
253
254        let intersection: HashSet<_> = set1.intersection(set2).collect();
255        let union: HashSet<_> = set1.union(set2).collect();
256
257        if union.is_empty() {
258            0.0
259        } else {
260            intersection.len() as f32 / union.len() as f32
261        }
262    }
263
264    /// Tokenize and normalize text (lowercase, remove stopwords)
265    fn tokenize_and_normalize(&self, text: &str) -> HashSet<String> {
266        TextUtils::tokenize(text)
267            .into_iter()
268            .filter(|word| !self.stopwords.contains(word) && word.len() > 2)
269            .collect()
270    }
271
272    /// Calculate title metrics (Jaccard similarity and match score)
273    fn calculate_title_metrics(&self, extracted: &str, ground_truth: &str) -> (f32, f32) {
274        if ground_truth.is_empty() {
275            return (0.5, 0.5); // No ground truth to compare
276        }
277
278        if extracted.is_empty() {
279            return (0.0, 0.0);
280        }
281
282        // Tokenize titles
283        let extracted_words: HashSet<_> = TextUtils::tokenize(extracted)
284            .into_iter()
285            .filter(|w| w.len() > 2)
286            .collect();
287        let gt_words: HashSet<_> = TextUtils::tokenize(ground_truth)
288            .into_iter()
289            .filter(|w| w.len() > 2)
290            .collect();
291
292        if gt_words.is_empty() {
293            return (0.5, 0.5);
294        }
295
296        // Jaccard similarity
297        let jaccard = self.calculate_jaccard_similarity(&extracted_words, &gt_words);
298
299        // Calculate F1 score for title
300        let intersection = extracted_words.intersection(&gt_words).count();
301        let recall = intersection as f32 / gt_words.len() as f32;
302        let precision = if extracted_words.is_empty() {
303            0.0
304        } else {
305            intersection as f32 / extracted_words.len() as f32
306        };
307
308        let f1_score = if recall + precision == 0.0 {
309            0.0
310        } else {
311            2.0 * recall * precision / (recall + precision)
312        };
313
314        (jaccard, f1_score)
315    }
316
317    /// Evaluate batch of extractions
318    pub fn evaluate_batch(
319        &self,
320        extractions: Vec<(String, Option<String>, &GroundTruthData, f32)>,
321    ) -> Vec<EvaluationMetrics> {
322        extractions
323            .into_iter()
324            .map(|(text, title, gt, quality)| {
325                self.evaluate(&text, title.as_deref(), gt, quality)
326            })
327            .collect()
328    }
329
330    /// Calculate average metrics across batch
331    pub fn average_metrics(metrics: &[EvaluationMetrics]) -> EvaluationMetrics {
332        if metrics.is_empty() {
333            return EvaluationMetrics {
334                text_jaccard_similarity: 0.0,
335                text_precision: 0.0,
336                text_recall: 0.0,
337                text_f1_score: 0.0,
338                length_ratio: 0.0,
339                length_difference: 0,
340                sentence_overlap: 0.0,
341                paragraph_overlap: 0.0,
342                title_jaccard_similarity: 0.0,
343                title_match_score: 0.0,
344                text_similarity_score: 0.0,
345                title_similarity_score: 0.0,
346                existing_quality_score: 0.0,
347                combined_quality: 0.0,
348            };
349        }
350
351        let n = metrics.len() as f32;
352
353        EvaluationMetrics {
354            text_jaccard_similarity: metrics.iter().map(|m| m.text_jaccard_similarity).sum::<f32>() / n,
355            text_precision: metrics.iter().map(|m| m.text_precision).sum::<f32>() / n,
356            text_recall: metrics.iter().map(|m| m.text_recall).sum::<f32>() / n,
357            text_f1_score: metrics.iter().map(|m| m.text_f1_score).sum::<f32>() / n,
358            length_ratio: metrics.iter().map(|m| m.length_ratio).sum::<f32>() / n,
359            length_difference: (metrics.iter().map(|m| m.length_difference).sum::<i32>() as f32 / n) as i32,
360            sentence_overlap: metrics.iter().map(|m| m.sentence_overlap).sum::<f32>() / n,
361            paragraph_overlap: metrics.iter().map(|m| m.paragraph_overlap).sum::<f32>() / n,
362            title_jaccard_similarity: metrics.iter().map(|m| m.title_jaccard_similarity).sum::<f32>() / n,
363            title_match_score: metrics.iter().map(|m| m.title_match_score).sum::<f32>() / n,
364            text_similarity_score: metrics.iter().map(|m| m.text_similarity_score).sum::<f32>() / n,
365            title_similarity_score: metrics.iter().map(|m| m.title_similarity_score).sum::<f32>() / n,
366            existing_quality_score: metrics.iter().map(|m| m.existing_quality_score).sum::<f32>() / n,
367            combined_quality: metrics.iter().map(|m| m.combined_quality).sum::<f32>() / n,
368        }
369    }
370}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end check: a partial extraction scored against ground truth
    /// yields non-zero similarity scores and preserves the supplied quality.
    #[test]
    fn test_evaluation() {
        let stopwords: HashSet<String> = ["the", "a", "is"]
            .iter()
            .map(|s| s.to_string())
            .collect();

        let evaluator = GroundTruthEvaluator::new(stopwords);

        let gt = GroundTruthData {
            data_type: Some("news".to_string()),
            data_key: None,
            fetch_timestamp: None,
            session_id: None,
            mod_date: None,
            title: Some("Test Article Title".to_string()),
            text: Some("This is the ground truth article text with several sentences. It contains important information.".to_string()),
            url: Some("https://example.com/article".to_string()),
            pub_date: None,
            pubdate: Some("2025-01-01".to_string()),
            author: None,
            source_name: None,
            language: Some("en".to_string()),
            keywords: None,
            industries: None,
            unique_id: None,
            module: None,
        };

        let extracted_text = "This is the extracted article text with several sentences.";
        let extracted_title = Some("Test Article");

        let metrics = evaluator.evaluate(extracted_text, extracted_title, &gt, 0.8);

        assert!(metrics.text_f1_score > 0.0);
        assert!(metrics.combined_quality > 0.0);
        assert!(metrics.title_match_score > 0.0);
        assert_eq!(metrics.existing_quality_score, 0.8);
    }

    /// Jaccard of {hello, world} vs {hello, world, test} is 2/3.
    #[test]
    fn test_jaccard_similarity() {
        let evaluator = GroundTruthEvaluator::new(HashSet::new());

        let left: HashSet<String> = ["hello", "world"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let right: HashSet<String> = ["hello", "world", "test"]
            .iter()
            .map(|s| s.to_string())
            .collect();

        let similarity = evaluator.calculate_jaccard_similarity(&left, &right);
        assert!((similarity - 0.666).abs() < 0.01);
    }
}