content_extractor_rl/
site_profile.rs1use serde::{Deserialize, Serialize};
7use chrono::{DateTime, Utc};
8use std::collections::HashMap;
9use std::path::{Path, PathBuf};
10use crate::Result;
11use sha2::Digest;
12
/// Accumulated extraction history and derived statistics for a single domain.
///
/// Serialized to and from JSON by [`SiteProfile::save`] and [`SiteProfile::load`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SiteProfile {
    /// Domain this profile was created for.
    pub domain: String,
    /// Records appended by `add_extraction`, trimmed to the most recent 1000.
    pub extractions: Vec<ExtractionRecord>,
    /// Number of times each non-empty XPath was seen on an extraction whose
    /// quality score exceeded 0.7.
    pub successful_xpaths: HashMap<String, usize>,
    /// Raw parameter samples (not averages, despite the name), keyed by
    /// parameter name and taken from extractions whose quality score exceeded
    /// 0.7. `get_recommended_parameters` reduces each list to a single value.
    pub avg_parameters: HashMap<String, Vec<f64>>,
    /// Quality scores appended by `add_extraction`, one per call.
    pub quality_scores: Vec<f32>,
    /// Set when the profile is created and again on every `add_extraction`.
    pub last_updated: DateTime<Utc>,
}
23
/// Condensed, serializable summary of one extraction stored in a
/// [`SiteProfile`]; the extracted text itself is not kept, only its length.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionRecord {
    /// UTC time at which the record was created by `add_extraction`.
    pub timestamp: DateTime<Utc>,
    /// Quality score copied from the extraction result.
    pub quality_score: f32,
    /// XPath copied from the extraction result (may be empty).
    pub xpath: String,
    /// Parameters copied from the extraction result.
    pub parameters: HashMap<String, f64>,
    /// Length of the extracted text in bytes (`String::len`).
    pub text_length: usize,
}
32
33impl SiteProfile {
34 pub fn new(domain: String) -> Self {
36 Self {
37 domain,
38 extractions: Vec::new(),
39 successful_xpaths: HashMap::new(),
40 avg_parameters: HashMap::new(),
41 quality_scores: Vec::new(),
42 last_updated: Utc::now(),
43 }
44 }
45
46 pub fn add_extraction(&mut self, result: ExtractionResult) {
48 let record = ExtractionRecord {
49 timestamp: Utc::now(),
50 quality_score: result.quality_score,
51 xpath: result.xpath.clone(),
52 parameters: result.parameters.clone(),
53 text_length: result.text.len(),
54 };
55
56 self.extractions.push(record);
57
58 if result.quality_score > 0.7 {
60 if !result.xpath.is_empty() {
61 *self.successful_xpaths.entry(result.xpath.clone()).or_insert(0) += 1;
62 }
63
64 for (key, value) in result.parameters.iter() {
65 self.avg_parameters.entry(key.clone())
66 .or_default()
67 .push(*value);
68 }
69 }
70
71 self.quality_scores.push(result.quality_score);
72 self.last_updated = Utc::now();
73
74 if self.extractions.len() > 1000 {
76 self.extractions = self.extractions.split_off(self.extractions.len() - 1000);
77 }
78 }
79
80 pub fn get_best_xpath(&self) -> Option<&String> {
82 self.successful_xpaths.iter()
83 .max_by_key(|(_, count)| *count)
84 .map(|(xpath, _)| xpath)
85 }
86
87 pub fn get_recommended_parameters(&self) -> HashMap<String, f64> {
89 let mut recommended = HashMap::new();
90
91 for (param, values) in self.avg_parameters.iter() {
92 if !values.is_empty() {
93 let mut sorted = values.clone();
94 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
95 let median = sorted[sorted.len() / 2];
96 recommended.insert(param.clone(), median);
97 }
98 }
99
100 recommended
101 }
102
103 pub fn get_success_rate(&self) -> f32 {
105 if self.quality_scores.is_empty() {
106 return 0.0;
107 }
108
109 let successful = self.quality_scores.iter()
110 .filter(|&&score| score > 0.7)
111 .count();
112
113 successful as f32 / self.quality_scores.len() as f32
114 }
115
116 pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<()> {
118 let json = serde_json::to_string_pretty(self)?;
119 std::fs::write(path, json)?;
120 Ok(())
121 }
122
123 pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
125 let json = std::fs::read_to_string(path)?;
126 let profile = serde_json::from_str(&json)?;
127 Ok(profile)
128 }
129}
130
/// Outcome of a single extraction, consumed by `SiteProfile::add_extraction`.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    /// Extracted text; only its byte length is stored in the profile.
    pub text: String,
    /// XPath associated with the extraction; an empty string is never counted
    /// as a successful XPath.
    // NOTE(review): presumably the XPath of the node the text came from — confirm with the producer.
    pub xpath: String,
    /// Quality score; values above 0.7 are treated as successful by
    /// `SiteProfile`.
    pub quality_score: f32,
    /// Named numeric parameters recorded alongside the extraction.
    pub parameters: HashMap<String, f64>,
    /// Optional title; not read anywhere in this file.
    pub title: Option<String>,
    /// Optional date string; not read anywhere in this file.
    pub date: Option<String>,
}
141
/// Directory-backed store of [`SiteProfile`]s with an in-memory cache.
pub struct SiteProfileMemory {
    /// Directory holding one `<hash>.json` file per domain.
    storage_dir: PathBuf,
    /// Profiles loaded or created so far, keyed by domain.
    cache: HashMap<String, SiteProfile>,
}
147
148impl SiteProfileMemory {
149 pub fn new<P: AsRef<Path>>(storage_dir: P) -> Result<Self> {
151 let storage_dir = storage_dir.as_ref().to_path_buf();
152 std::fs::create_dir_all(&storage_dir)?;
153
154 Ok(Self {
155 storage_dir,
156 cache: HashMap::new(),
157 })
158 }
159
160 pub fn get_profile(&mut self, domain: &str) -> &mut SiteProfile {
162 if !self.cache.contains_key(domain) {
163 let profile_path = self.get_profile_path(domain);
164
165 let profile = if profile_path.exists() {
166 SiteProfile::load(&profile_path).unwrap_or_else(|_| SiteProfile::new(domain.to_string()))
167 } else {
168 SiteProfile::new(domain.to_string())
169 };
170
171 self.cache.insert(domain.to_string(), profile);
172 }
173
174 self.cache.get_mut(domain).unwrap()
175 }
176
177 pub fn save_profile(&self, domain: &str) -> Result<()> {
179 if let Some(profile) = self.cache.get(domain) {
180 let profile_path = self.get_profile_path(domain);
181 profile.save(profile_path)?;
182 }
183 Ok(())
184 }
185
186 pub fn save_all(&self) -> Result<()> {
188 for (domain, profile) in self.cache.iter() {
189 let profile_path = self.get_profile_path(domain);
190 profile.save(profile_path)?;
191 }
192 Ok(())
193 }
194
195 fn get_profile_path(&self, domain: &str) -> PathBuf {
197 let hash = hash_domain(domain);
198 self.storage_dir.join(format!("{}.json", hash))
199 }
200}
201
202fn hash_domain(domain: &str) -> String {
204 use sha2::Sha256;
205 let mut hasher = Sha256::new();
206 hasher.update(domain.as_bytes());
207 let result = hasher.finalize();
208
209 result.iter()
211 .map(|b| format!("{:02x}", b))
212 .collect::<String>()
213 .chars()
214 .take(16)
215 .collect()
216}
217
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Builds an extraction result with the given XPath and quality score.
    fn sample_result(xpath: &str, quality_score: f32) -> ExtractionResult {
        ExtractionResult {
            text: "Test content".to_string(),
            xpath: xpath.to_string(),
            quality_score,
            parameters: HashMap::new(),
            title: Some("test title".to_string()),
            date: None,
        }
    }

    #[test]
    fn test_site_profile() {
        let mut profile = SiteProfile::new("example.com".to_string());

        profile.add_extraction(sample_result("//article[1]", 0.8));

        assert_eq!(profile.quality_scores.len(), 1);
        assert_eq!(profile.extractions.len(), 1);
        assert_eq!(profile.extractions[0].text_length, "Test content".len());
        assert_eq!(
            profile.get_best_xpath().map(String::as_str),
            Some("//article[1]")
        );
        assert_eq!(profile.get_success_rate(), 1.0);
    }

    #[test]
    fn test_low_quality_extraction_is_not_successful() {
        let mut profile = SiteProfile::new("example.com".to_string());

        profile.add_extraction(sample_result("//div[1]", 0.5));

        // The extraction is still recorded, but it must not count as a success.
        assert_eq!(profile.extractions.len(), 1);
        assert!(profile.get_best_xpath().is_none());
        assert_eq!(profile.get_success_rate(), 0.0);
    }

    #[test]
    fn test_profile_memory() {
        let temp_dir = TempDir::new().unwrap();

        {
            let mut memory = SiteProfileMemory::new(temp_dir.path()).unwrap();

            let profile = memory.get_profile("example.com");
            assert_eq!(profile.domain, "example.com");
            profile.add_extraction(sample_result("//article[1]", 0.8));

            memory.save_all().unwrap();
        }

        // A fresh store over the same directory must see the saved profile.
        let mut reloaded = SiteProfileMemory::new(temp_dir.path()).unwrap();
        let profile = reloaded.get_profile("example.com");
        assert_eq!(profile.domain, "example.com");
        assert_eq!(profile.extractions.len(), 1);
        assert_eq!(
            profile.get_best_xpath().map(String::as_str),
            Some("//article[1]")
        );
    }
}