// content_extractor_rl/baseline_extractor.rs
use std::collections::HashSet;
use std::sync::LazyLock;

use chrono::{NaiveDate, NaiveDateTime};
use regex::Regex;
use scraper::{ElementRef, Html, Selector};

use crate::html_parser::HtmlParser;
use crate::site_profile::ExtractionResult;
use crate::text_utils::TextUtils;
use crate::Result;

14#[derive(Clone)]
16pub struct BaselineExtractor {
17 stopwords: HashSet<String>,
18}
19
20impl BaselineExtractor {
21 pub fn new(stopwords: HashSet<String>) -> Self {
23 Self { stopwords }
24 }
25
26 pub fn extract(&self, html: &str) -> Result<ExtractionResult> {
28 let title = MetadataExtractor::extract_title(html);
30 let date = MetadataExtractor::extract_date(html);
31
32 let document = HtmlParser::clean_html(html)?;
33 let candidates = self.get_candidates(&document);
34
35 if candidates.is_empty() {
36 return Ok(ExtractionResult {
37 text: String::new(),
38 xpath: String::new(),
39 quality_score: 0.0,
40 parameters: std::collections::HashMap::new(),
41 title,
42 date,
43 });
44 }
45
46 let (best_node, _score) = candidates.into_iter()
47 .max_by(|(_, score_a), (_, score_b)| {
48 score_a.partial_cmp(score_b).unwrap_or(std::cmp::Ordering::Equal)
49 })
50 .unwrap();
51
52 let text = self.extract_text(best_node);
53 let xpath = HtmlParser::get_element_path(best_node);
54 let quality = TextUtils::calculate_text_quality(&text, &self.stopwords);
55
56 Ok(ExtractionResult {
57 text,
58 xpath,
59 quality_score: quality,
60 parameters: std::collections::HashMap::new(),
61 title,
62 date,
63 })
64 }
65
66 fn get_candidates<'a>(&self, document: &'a Html) -> Vec<(ElementRef<'a>, f64)> {
68 let mut candidates = Vec::new();
69
70 let selectors = vec!["article", "div", "section"];
72
73 for selector_str in selectors {
74 if let Ok(selector) = Selector::parse(selector_str) {
75 for element in document.select(&selector) {
76 let score = self.score_node(element);
77 if score > 0.0 {
78 candidates.push((element, score));
79 }
80 }
81 }
82 }
83
84 candidates.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
86 candidates.truncate(10);
87
88 candidates
89 }
90
91 fn score_node(&self, node: ElementRef) -> f64 {
93 let text = HtmlParser::extract_text(node);
94
95 if text.len() < 50 {
96 return 0.0;
97 }
98
99 let stopword_count = TextUtils::count_stopwords(&text, &self.stopwords);
101 let mut score = (stopword_count * stopword_count) as f64;
102
103 let paragraphs = HtmlParser::extract_paragraphs(node);
105 let paragraph_count = paragraphs.len().min(5);
106 score *= 1.0 + 0.5 * paragraph_count as f64;
107
108 if let Ok(link_selector) = Selector::parse("a") {
110 let link_text: String = node.select(&link_selector)
111 .map(|a| HtmlParser::extract_text(a))
112 .collect();
113
114 if !text.is_empty() {
115 let link_density = link_text.len() as f64 / text.len() as f64;
116 if link_density > 0.5 {
117 score *= 1.0 - link_density;
118 }
119 }
120 }
121
122 score
123 }
124
125 fn extract_text(&self, node: ElementRef) -> String {
127 let paragraphs = HtmlParser::extract_paragraphs(node);
128
129 let filtered: Vec<String> = paragraphs.into_iter()
130 .filter(|p| {
131 let words: Vec<_> = p.split_whitespace().collect();
132
133 if words.len() < 4 {
135 return false;
136 }
137
138 true
139 })
140 .collect();
141
142 filtered.join("\n\n")
143 }
144
145 pub fn get_candidate_nodes<'a>(&self, document: &'a Html, top_k: usize) -> Vec<ElementRef<'a>> {
147 self.get_candidates(document)
148 .into_iter()
149 .take(top_k)
150 .map(|(node, _)| node)
151 .collect()
152 }
153}
154
155
156pub struct MetadataExtractor;
158
159impl MetadataExtractor {
160 pub fn extract_title(html: &str) -> Option<String> {
162 let document = Html::parse_document(html);
163
164 if let Some(title) = Self::extract_meta_tag(&document, "og:title") {
168 return Some(title);
169 }
170
171 if let Some(title) = Self::extract_meta_tag(&document, "twitter:title") {
173 return Some(title);
174 }
175
176 if let Some(title) = Self::extract_meta_tag(&document, "article:title") {
178 return Some(title);
179 }
180
181 if let Ok(selector) = Selector::parse("title") {
183 if let Some(title_elem) = document.select(&selector).next() {
184 let title = title_elem.text().collect::<String>().trim().to_string();
185 if !title.is_empty() {
186 return Some(Self::clean_title(&title));
187 }
188 }
189 }
190
191 if let Ok(selector) = Selector::parse("h1") {
193 if let Some(h1_elem) = document.select(&selector).next() {
194 let title = h1_elem.text().collect::<String>().trim().to_string();
195 if !title.is_empty() && title.len() > 10 {
196 return Some(title);
197 }
198 }
199 }
200
201 if let Ok(selector) = Selector::parse("article header h1, article h1") {
203 if let Some(elem) = document.select(&selector).next() {
204 let title = elem.text().collect::<String>().trim().to_string();
205 if !title.is_empty() && title.len() > 10 {
206 return Some(title);
207 }
208 }
209 }
210
211 None
212 }
213
214 pub fn extract_date(html: &str) -> Option<String> {
216 let document = Html::parse_document(html);
217
218 if let Some(date) = Self::extract_meta_tag(&document, "article:published_time") {
222 if let Some(normalized) = Self::normalize_date(&date) {
223 return Some(normalized);
224 }
225 }
226
227 if let Some(date) = Self::extract_meta_tag(&document, "datePublished") {
229 if let Some(normalized) = Self::normalize_date(&date) {
230 return Some(normalized);
231 }
232 }
233
234 for name in &["pubdate", "publishdate", "date", "DC.date"] {
236 if let Some(date) = Self::extract_meta_tag(&document, name) {
237 if let Some(normalized) = Self::normalize_date(&date) {
238 return Some(normalized);
239 }
240 }
241 }
242
243 if let Ok(selector) = Selector::parse("time[datetime], time[pubdate]") {
245 if let Some(time_elem) = document.select(&selector).next() {
246 if let Some(datetime) = time_elem.value().attr("datetime")
247 .or_else(|| time_elem.value().attr("pubdate")) {
248 if let Some(normalized) = Self::normalize_date(datetime) {
249 return Some(normalized);
250 }
251 }
252 }
253 }
254
255 if let Some(date) = Self::extract_date_from_text(html) {
257 return Some(date);
258 }
259
260 None
261 }
262
263 fn extract_meta_tag(document: &Html, property: &str) -> Option<String> {
265 let selector_str = format!("meta[property='{}']", property);
267 if let Ok(selector) = Selector::parse(&selector_str) {
268 if let Some(elem) = document.select(&selector).next() {
269 if let Some(content) = elem.value().attr("content") {
270 return Some(content.to_string());
271 }
272 }
273 }
274
275 let selector_str = format!("meta[name='{}']", property);
277 if let Ok(selector) = Selector::parse(&selector_str) {
278 if let Some(elem) = document.select(&selector).next() {
279 if let Some(content) = elem.value().attr("content") {
280 return Some(content.to_string());
281 }
282 }
283 }
284
285 None
286 }
287
288 fn clean_title(title: &str) -> String {
290 let separators = [" - ", " | ", " – ", " — ", " :: ", " » "];
292
293 for sep in &separators {
294 if let Some(pos) = title.rfind(sep) {
295 let cleaned = &title[..pos];
296 if cleaned.len() > 10 {
297 return cleaned.trim().to_string();
298 }
299 }
300 }
301
302 title.trim().to_string()
303 }
304
305 fn normalize_date(date_str: &str) -> Option<String> {
307 if date_str.contains('T') || date_str.contains("Z") {
309 return Some(date_str.to_string());
310 }
311
312 let formats = [
314 "%Y-%m-%d",
315 "%Y/%m/%d",
316 "%d-%m-%Y",
317 "%d/%m/%Y",
318 "%B %d, %Y",
319 "%b %d, %Y",
320 "%d %B %Y",
321 "%d %b %Y",
322 "%Y-%m-%dT%H:%M:%S",
323 "%Y-%m-%d %H:%M:%S",
324 ];
325
326 for format in &formats {
327 if let Ok(parsed) = NaiveDate::parse_from_str(date_str, format) {
328 return Some(parsed.format("%Y-%m-%d").to_string());
329 }
330 if let Ok(parsed) = NaiveDateTime::parse_from_str(date_str, format) {
331 return Some(parsed.format("%Y-%m-%d").to_string());
332 }
333 }
334
335 None
336 }
337
338 fn extract_date_from_text(html: &str) -> Option<String> {
340 lazy_static::lazy_static! {
341 static ref DATE_PATTERNS: Vec<Regex> = vec![
342 Regex::new(r"(\d{4}-\d{2}-\d{2})").unwrap(),
344 Regex::new(r"([A-Z][a-z]+ \d{1,2}, \d{4})").unwrap(),
346 Regex::new(r"(\d{1,2} [A-Z][a-z]+ \d{4})").unwrap(),
348 ];
349 }
350
351 for pattern in DATE_PATTERNS.iter() {
352 if let Some(captures) = pattern.captures(html) {
353 if let Some(matched) = captures.get(1) {
354 if let Some(normalized) = Self::normalize_date(matched.as_str()) {
355 return Some(normalized);
356 }
357 }
358 }
359 }
360
361 None
362 }
363}
364
365
366#[cfg(test)]
367mod tests {
368 use super::*;
369
370 #[test]
371 fn test_extract_title_from_og_tag() {
372 let html = r#"
373 <html>
374 <head>
375 <meta property="og:title" content="Test Article Title" />
376 </head>
377 </html>
378 "#;
379
380 let title = MetadataExtractor::extract_title(html);
381 assert_eq!(title, Some("Test Article Title".to_string()));
382 }
383
384 #[test]
385 fn test_extract_title_from_title_tag() {
386 let html = r#"
387 <html>
388 <head>
389 <title>Test Article - Site Name</title>
390 </head>
391 </html>
392 "#;
393
394 let title = MetadataExtractor::extract_title(html);
395 assert_eq!(title, Some("Test Article".to_string()));
396 }
397
398 #[test]
399 fn test_extract_date_from_meta() {
400 let html = r#"
401 <html>
402 <head>
403 <meta property="article:published_time" content="2021-04-05T10:30:00Z" />
404 </head>
405 </html>
406 "#;
407
408 let date = MetadataExtractor::extract_date(html);
409 assert!(date.is_some());
410 }
411
412 #[test]
413 fn test_normalize_date() {
414 assert_eq!(
415 MetadataExtractor::normalize_date("2021-04-05"),
416 Some("2021-04-05".to_string())
417 );
418
419 assert_eq!(
420 MetadataExtractor::normalize_date("April 5, 2021"),
421 Some("2021-04-05".to_string())
422 );
423 }
424
425 #[test]
426 fn test_baseline_extractor() {
427 let html = r#"
428 <html>
429 <body>
430 <article>
431 <h1>Test Article</h1>
432 <p>This is the first paragraph of the article.</p>
433 <p>This is the second paragraph with more content.</p>
434 </article>
435 </body>
436 </html>
437 "#;
438
439 let stopwords: HashSet<String> = vec!["the", "is", "of"]
440 .into_iter()
441 .map(|s| s.to_string())
442 .collect();
443
444 let extractor = BaselineExtractor::new(stopwords);
445 let result = extractor.extract(html).unwrap();
446
447 assert!(!result.text.is_empty());
448 assert!(result.quality_score > 0.0);
449 }
450}