Skip to main content

content_extractor_rl/
site_profile.rs

1// ============================================================================
2// FILE: crates/content-extractor-rl/src/site_profile.rs
3// ============================================================================
4
5
6use serde::{Deserialize, Serialize};
7use chrono::{DateTime, Utc};
8use std::collections::HashMap;
9use std::path::{Path, PathBuf};
10use crate::Result;
11use sha2::Digest;
12
13/// Site profile storing historical extraction patterns
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct SiteProfile {
16    pub domain: String,
17    pub extractions: Vec<ExtractionRecord>,
18    pub successful_xpaths: HashMap<String, usize>,
19    pub avg_parameters: HashMap<String, Vec<f64>>,
20    pub quality_scores: Vec<f32>,
21    pub last_updated: DateTime<Utc>,
22}
23
24#[derive(Debug, Clone, Serialize, Deserialize)]
25pub struct ExtractionRecord {
26    pub timestamp: DateTime<Utc>,
27    pub quality_score: f32,
28    pub xpath: String,
29    pub parameters: HashMap<String, f64>,
30    pub text_length: usize,
31}
32
33impl SiteProfile {
34    /// Create new site profile
35    pub fn new(domain: String) -> Self {
36        Self {
37            domain,
38            extractions: Vec::new(),
39            successful_xpaths: HashMap::new(),
40            avg_parameters: HashMap::new(),
41            quality_scores: Vec::new(),
42            last_updated: Utc::now(),
43        }
44    }
45
46    /// Add extraction result to profile
47    pub fn add_extraction(&mut self, result: ExtractionResult) {
48        let record = ExtractionRecord {
49            timestamp: Utc::now(),
50            quality_score: result.quality_score,
51            xpath: result.xpath.clone(),
52            parameters: result.parameters.clone(),
53            text_length: result.text.len(),
54        };
55
56        self.extractions.push(record);
57
58        // Update statistics for successful extractions
59        if result.quality_score > 0.7 {
60            if !result.xpath.is_empty() {
61                *self.successful_xpaths.entry(result.xpath.clone()).or_insert(0) += 1;
62            }
63
64            for (key, value) in result.parameters.iter() {
65                self.avg_parameters.entry(key.clone())
66                    .or_default()
67                    .push(*value);
68            }
69        }
70
71        self.quality_scores.push(result.quality_score);
72        self.last_updated = Utc::now();
73
74        // Keep only recent extractions (last 1000)
75        if self.extractions.len() > 1000 {
76            self.extractions = self.extractions.split_off(self.extractions.len() - 1000);
77        }
78    }
79
80    /// Get most successful XPath pattern
81    pub fn get_best_xpath(&self) -> Option<&String> {
82        self.successful_xpaths.iter()
83            .max_by_key(|(_, count)| *count)
84            .map(|(xpath, _)| xpath)
85    }
86
87    /// Get recommended parameters (median values)
88    pub fn get_recommended_parameters(&self) -> HashMap<String, f64> {
89        let mut recommended = HashMap::new();
90
91        for (param, values) in self.avg_parameters.iter() {
92            if !values.is_empty() {
93                let mut sorted = values.clone();
94                sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
95                let median = sorted[sorted.len() / 2];
96                recommended.insert(param.clone(), median);
97            }
98        }
99
100        recommended
101    }
102
103    /// Calculate success rate
104    pub fn get_success_rate(&self) -> f32 {
105        if self.quality_scores.is_empty() {
106            return 0.0;
107        }
108
109        let successful = self.quality_scores.iter()
110            .filter(|&&score| score > 0.7)
111            .count();
112
113        successful as f32 / self.quality_scores.len() as f32
114    }
115
116    /// Save profile to file
117    pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<()> {
118        let json = serde_json::to_string_pretty(self)?;
119        std::fs::write(path, json)?;
120        Ok(())
121    }
122
123    /// Load profile from file
124    pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
125        let json = std::fs::read_to_string(path)?;
126        let profile = serde_json::from_str(&json)?;
127        Ok(profile)
128    }
129}
130
/// Outcome of a single extraction attempt.
#[derive(Clone, Debug)]
pub struct ExtractionResult {
    /// Extracted text content.
    pub text: String,
    /// XPath associated with the extraction; may be empty.
    pub xpath: String,
    /// Quality score assigned to the extraction.
    pub quality_score: f32,
    /// Parameter values used for the extraction, keyed by name.
    pub parameters: HashMap<String, f64>,
    /// Optional title; not recorded by `SiteProfile::add_extraction`.
    pub title: Option<String>,
    /// Optional date string; not recorded by `SiteProfile::add_extraction`.
    pub date: Option<String>,
}
141
142/// Site profile memory manager
143pub struct SiteProfileMemory {
144    storage_dir: PathBuf,
145    cache: HashMap<String, SiteProfile>,
146}
147
148impl SiteProfileMemory {
149    /// Create new site profile memory
150    pub fn new<P: AsRef<Path>>(storage_dir: P) -> Result<Self> {
151        let storage_dir = storage_dir.as_ref().to_path_buf();
152        std::fs::create_dir_all(&storage_dir)?;
153
154        Ok(Self {
155            storage_dir,
156            cache: HashMap::new(),
157        })
158    }
159
160    /// Get or create profile for domain
161    pub fn get_profile(&mut self, domain: &str) -> &mut SiteProfile {
162        if !self.cache.contains_key(domain) {
163            let profile_path = self.get_profile_path(domain);
164
165            let profile = if profile_path.exists() {
166                SiteProfile::load(&profile_path).unwrap_or_else(|_| SiteProfile::new(domain.to_string()))
167            } else {
168                SiteProfile::new(domain.to_string())
169            };
170
171            self.cache.insert(domain.to_string(), profile);
172        }
173
174        self.cache.get_mut(domain).unwrap()
175    }
176
177    /// Save profile to disk
178    pub fn save_profile(&self, domain: &str) -> Result<()> {
179        if let Some(profile) = self.cache.get(domain) {
180            let profile_path = self.get_profile_path(domain);
181            profile.save(profile_path)?;
182        }
183        Ok(())
184    }
185
186    /// Save all cached profiles
187    pub fn save_all(&self) -> Result<()> {
188        for (domain, profile) in self.cache.iter() {
189            let profile_path = self.get_profile_path(domain);
190            profile.save(profile_path)?;
191        }
192        Ok(())
193    }
194
195    /// Get profile file path
196    fn get_profile_path(&self, domain: &str) -> PathBuf {
197        let hash = hash_domain(domain);
198        self.storage_dir.join(format!("{}.json", hash))
199    }
200}
201
202/// Hash domain name to create filename
203fn hash_domain(domain: &str) -> String {
204    use sha2::Sha256;
205    let mut hasher = Sha256::new();
206    hasher.update(domain.as_bytes());
207    let result = hasher.finalize();
208
209    // Convert hash bytes to hex string
210    result.iter()
211        .map(|b| format!("{:02x}", b))
212        .collect::<String>()
213        .chars()
214        .take(16)
215        .collect()
216}
217
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Adding one extraction yields exactly one score and one history record.
    #[test]
    fn test_site_profile() {
        let extraction = ExtractionResult {
            text: String::from("Test content"),
            xpath: String::from("//article[1]"),
            quality_score: 0.8,
            parameters: HashMap::new(),
            title: Some(String::from("test title")),
            date: None,
        };

        let mut profile = SiteProfile::new(String::from("example.com"));
        profile.add_extraction(extraction);

        assert_eq!(profile.quality_scores.len(), 1);
        assert_eq!(profile.extractions.len(), 1);
    }

    /// A profile requested from an empty store carries the requested domain
    /// and can be persisted without error.
    #[test]
    fn test_profile_memory() {
        let scratch = TempDir::new().unwrap();
        let mut memory = SiteProfileMemory::new(scratch.path()).unwrap();

        assert_eq!(memory.get_profile("example.com").domain, "example.com");

        memory.save_all().unwrap();
    }
}
252}