// content_extractor_rl/baseline_extractor.rs
1// ============================================================================
2// FILE: crates/content-extractor-rl/src/baseline_extractor.rs
3// ============================================================================
4
5use scraper::{Html, Selector, ElementRef};
6use crate::text_utils::TextUtils;
7use crate::html_parser::HtmlParser;
8use crate::site_profile::ExtractionResult;
9use crate::Result;
10use std::collections::HashSet;
11use chrono::{NaiveDate, NaiveDateTime};
12use regex::Regex;
13
/// Heuristic baseline content extractor.
///
/// Scores candidate container elements by stopword density and extracts
/// the text of the best-scoring one. Serves as the non-learned reference
/// implementation for this crate.
#[derive(Clone)]
pub struct BaselineExtractor {
    // Stopword set used both for scoring candidate nodes and for the
    // final text-quality estimate.
    stopwords: HashSet<String>,
}
19
20impl BaselineExtractor {
21    /// Create new baseline extractor
22    pub fn new(stopwords: HashSet<String>) -> Self {
23        Self { stopwords }
24    }
25
26    /// Extract article from HTML
27    pub fn extract(&self, html: &str) -> Result<ExtractionResult> {
28        // Extract metadata first
29        let title = MetadataExtractor::extract_title(html);
30        let date = MetadataExtractor::extract_date(html);
31
32        let document = HtmlParser::clean_html(html)?;
33        let candidates = self.get_candidates(&document);
34
35        if candidates.is_empty() {
36            return Ok(ExtractionResult {
37                text: String::new(),
38                xpath: String::new(),
39                quality_score: 0.0,
40                parameters: std::collections::HashMap::new(),
41                title,
42                date,
43            });
44        }
45
46        let (best_node, _score) = candidates.into_iter()
47            .max_by(|(_, score_a), (_, score_b)| {
48                score_a.partial_cmp(score_b).unwrap_or(std::cmp::Ordering::Equal)
49            })
50            .unwrap();
51
52        let text = self.extract_text(best_node);
53        let xpath = HtmlParser::get_element_path(best_node);
54        let quality = TextUtils::calculate_text_quality(&text, &self.stopwords);
55
56        Ok(ExtractionResult {
57            text,
58            xpath,
59            quality_score: quality,
60            parameters: std::collections::HashMap::new(),
61            title,
62            date,
63        })
64    }
65
66    /// Get candidate nodes with scores
67    fn get_candidates<'a>(&self, document: &'a Html) -> Vec<(ElementRef<'a>, f64)> {
68        let mut candidates = Vec::new();
69
70        // Try different selectors
71        let selectors = vec!["article", "div", "section"];
72
73        for selector_str in selectors {
74            if let Ok(selector) = Selector::parse(selector_str) {
75                for element in document.select(&selector) {
76                    let score = self.score_node(element);
77                    if score > 0.0 {
78                        candidates.push((element, score));
79                    }
80                }
81            }
82        }
83
84        // Sort by score and take top 10
85        candidates.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
86        candidates.truncate(10);
87
88        candidates
89    }
90
91    /// Score node using stopword density
92    fn score_node(&self, node: ElementRef) -> f64 {
93        let text = HtmlParser::extract_text(node);
94
95        if text.len() < 50 {
96            return 0.0;
97        }
98
99        // Base score: stopword count squared
100        let stopword_count = TextUtils::count_stopwords(&text, &self.stopwords);
101        let mut score = (stopword_count * stopword_count) as f64;
102
103        // Boost for paragraphs
104        let paragraphs = HtmlParser::extract_paragraphs(node);
105        let paragraph_count = paragraphs.len().min(5);
106        score *= 1.0 + 0.5 * paragraph_count as f64;
107
108        // Penalty for high link density
109        if let Ok(link_selector) = Selector::parse("a") {
110            let link_text: String = node.select(&link_selector)
111                .map(|a| HtmlParser::extract_text(a))
112                .collect();
113
114            if !text.is_empty() {
115                let link_density = link_text.len() as f64 / text.len() as f64;
116                if link_density > 0.5 {
117                    score *= 1.0 - link_density;
118                }
119            }
120        }
121
122        score
123    }
124
125    /// Extract clean text from node
126    fn extract_text(&self, node: ElementRef) -> String {
127        let paragraphs = HtmlParser::extract_paragraphs(node);
128
129        let filtered: Vec<String> = paragraphs.into_iter()
130            .filter(|p| {
131                let words: Vec<_> = p.split_whitespace().collect();
132
133                // Minimum word threshold
134                if words.len() < 4 {
135                    return false;
136                }
137
138                true
139            })
140            .collect();
141
142        filtered.join("\n\n")
143    }
144
145    /// Get candidate nodes for environment
146    pub fn get_candidate_nodes<'a>(&self, document: &'a Html, top_k: usize) -> Vec<ElementRef<'a>> {
147        self.get_candidates(document)
148            .into_iter()
149            .take(top_k)
150            .map(|(node, _)| node)
151            .collect()
152    }
153}
154
155
/// Extracts page metadata (title and publication date) from raw HTML.
///
/// Stateless: all functionality is exposed as associated functions.
pub struct MetadataExtractor;
158
159impl MetadataExtractor {
160    /// Extract title from HTML
161    pub fn extract_title(html: &str) -> Option<String> {
162        let document = Html::parse_document(html);
163
164        // Try multiple strategies in order of preference
165
166        // 1. OpenGraph meta tag
167        if let Some(title) = Self::extract_meta_tag(&document, "og:title") {
168            return Some(title);
169        }
170
171        // 2. Twitter card meta tag
172        if let Some(title) = Self::extract_meta_tag(&document, "twitter:title") {
173            return Some(title);
174        }
175
176        // 3. Article title meta tag
177        if let Some(title) = Self::extract_meta_tag(&document, "article:title") {
178            return Some(title);
179        }
180
181        // 4. Standard <title> tag
182        if let Ok(selector) = Selector::parse("title") {
183            if let Some(title_elem) = document.select(&selector).next() {
184                let title = title_elem.text().collect::<String>().trim().to_string();
185                if !title.is_empty() {
186                    return Some(Self::clean_title(&title));
187                }
188            }
189        }
190
191        // 5. h1 tag (often the article title)
192        if let Ok(selector) = Selector::parse("h1") {
193            if let Some(h1_elem) = document.select(&selector).next() {
194                let title = h1_elem.text().collect::<String>().trim().to_string();
195                if !title.is_empty() && title.len() > 10 {
196                    return Some(title);
197                }
198            }
199        }
200
201        // 6. article > header > h1
202        if let Ok(selector) = Selector::parse("article header h1, article h1") {
203            if let Some(elem) = document.select(&selector).next() {
204                let title = elem.text().collect::<String>().trim().to_string();
205                if !title.is_empty() && title.len() > 10 {
206                    return Some(title);
207                }
208            }
209        }
210
211        None
212    }
213
214    /// Extract publication date from HTML
215    pub fn extract_date(html: &str) -> Option<String> {
216        let document = Html::parse_document(html);
217
218        // Try multiple strategies
219
220        // 1. OpenGraph meta tag
221        if let Some(date) = Self::extract_meta_tag(&document, "article:published_time") {
222            if let Some(normalized) = Self::normalize_date(&date) {
223                return Some(normalized);
224            }
225        }
226
227        // 2. Schema.org meta tags
228        if let Some(date) = Self::extract_meta_tag(&document, "datePublished") {
229            if let Some(normalized) = Self::normalize_date(&date) {
230                return Some(normalized);
231            }
232        }
233
234        // 3. Standard meta tags
235        for name in &["pubdate", "publishdate", "date", "DC.date"] {
236            if let Some(date) = Self::extract_meta_tag(&document, name) {
237                if let Some(normalized) = Self::normalize_date(&date) {
238                    return Some(normalized);
239                }
240            }
241        }
242
243        // 4. time tag with datetime attribute
244        if let Ok(selector) = Selector::parse("time[datetime], time[pubdate]") {
245            if let Some(time_elem) = document.select(&selector).next() {
246                if let Some(datetime) = time_elem.value().attr("datetime")
247                    .or_else(|| time_elem.value().attr("pubdate")) {
248                    if let Some(normalized) = Self::normalize_date(datetime) {
249                        return Some(normalized);
250                    }
251                }
252            }
253        }
254
255        // 5. Common date patterns in text
256        if let Some(date) = Self::extract_date_from_text(html) {
257            return Some(date);
258        }
259
260        None
261    }
262
263    /// Extract meta tag content
264    fn extract_meta_tag(document: &Html, property: &str) -> Option<String> {
265        // Try property attribute
266        let selector_str = format!("meta[property='{}']", property);
267        if let Ok(selector) = Selector::parse(&selector_str) {
268            if let Some(elem) = document.select(&selector).next() {
269                if let Some(content) = elem.value().attr("content") {
270                    return Some(content.to_string());
271                }
272            }
273        }
274
275        // Try name attribute
276        let selector_str = format!("meta[name='{}']", property);
277        if let Ok(selector) = Selector::parse(&selector_str) {
278            if let Some(elem) = document.select(&selector).next() {
279                if let Some(content) = elem.value().attr("content") {
280                    return Some(content.to_string());
281                }
282            }
283        }
284
285        None
286    }
287
288    /// Clean title by removing site name suffixes
289    fn clean_title(title: &str) -> String {
290        // Common separators between title and site name
291        let separators = [" - ", " | ", " – ", " — ", " :: ", " » "];
292
293        for sep in &separators {
294            if let Some(pos) = title.rfind(sep) {
295                let cleaned = &title[..pos];
296                if cleaned.len() > 10 {
297                    return cleaned.trim().to_string();
298                }
299            }
300        }
301
302        title.trim().to_string()
303    }
304
305    /// Normalize date to ISO 8601 format
306    fn normalize_date(date_str: &str) -> Option<String> {
307        // Already in ISO format
308        if date_str.contains('T') || date_str.contains("Z") {
309            return Some(date_str.to_string());
310        }
311
312        // Try parsing common formats
313        let formats = [
314            "%Y-%m-%d",
315            "%Y/%m/%d",
316            "%d-%m-%Y",
317            "%d/%m/%Y",
318            "%B %d, %Y",
319            "%b %d, %Y",
320            "%d %B %Y",
321            "%d %b %Y",
322            "%Y-%m-%dT%H:%M:%S",
323            "%Y-%m-%d %H:%M:%S",
324        ];
325
326        for format in &formats {
327            if let Ok(parsed) = NaiveDate::parse_from_str(date_str, format) {
328                return Some(parsed.format("%Y-%m-%d").to_string());
329            }
330            if let Ok(parsed) = NaiveDateTime::parse_from_str(date_str, format) {
331                return Some(parsed.format("%Y-%m-%d").to_string());
332            }
333        }
334
335        None
336    }
337
338    /// Extract date from common text patterns
339    fn extract_date_from_text(html: &str) -> Option<String> {
340        lazy_static::lazy_static! {
341            static ref DATE_PATTERNS: Vec<Regex> = vec![
342                // ISO format: 2021-04-05
343                Regex::new(r"(\d{4}-\d{2}-\d{2})").unwrap(),
344                // US format: April 5, 2021
345                Regex::new(r"([A-Z][a-z]+ \d{1,2}, \d{4})").unwrap(),
346                // European: 5 April 2021
347                Regex::new(r"(\d{1,2} [A-Z][a-z]+ \d{4})").unwrap(),
348            ];
349        }
350
351        for pattern in DATE_PATTERNS.iter() {
352            if let Some(captures) = pattern.captures(html) {
353                if let Some(matched) = captures.get(1) {
354                    if let Some(normalized) = Self::normalize_date(matched.as_str()) {
355                        return Some(normalized);
356                    }
357                }
358            }
359        }
360
361        None
362    }
363}
364
365
#[cfg(test)]
mod tests {
    use super::*;

    // The OpenGraph meta tag is the highest-priority title source.
    #[test]
    fn test_extract_title_from_og_tag() {
        let page = r#"
            <html>
                <head>
                    <meta property="og:title" content="Test Article Title" />
                </head>
            </html>
        "#;

        assert_eq!(
            MetadataExtractor::extract_title(page),
            Some("Test Article Title".to_string())
        );
    }

    // Falling back to <title>, the "- Site Name" suffix must be stripped.
    #[test]
    fn test_extract_title_from_title_tag() {
        let page = r#"
            <html>
                <head>
                    <title>Test Article - Site Name</title>
                </head>
            </html>
        "#;

        assert_eq!(
            MetadataExtractor::extract_title(page),
            Some("Test Article".to_string())
        );
    }

    // The publication date should be picked up from article:published_time.
    #[test]
    fn test_extract_date_from_meta() {
        let page = r#"
            <html>
                <head>
                    <meta property="article:published_time" content="2021-04-05T10:30:00Z" />
                </head>
            </html>
        "#;

        assert!(MetadataExtractor::extract_date(page).is_some());
    }

    // Both ISO and long-form dates normalize to YYYY-MM-DD.
    #[test]
    fn test_normalize_date() {
        let expected = Some("2021-04-05".to_string());
        assert_eq!(MetadataExtractor::normalize_date("2021-04-05"), expected);
        assert_eq!(MetadataExtractor::normalize_date("April 5, 2021"), expected);
    }

    // End to end: the extractor pulls non-empty, positively scored text
    // out of a simple single-article page.
    #[test]
    fn test_baseline_extractor() {
        let page = r#"
            <html>
                <body>
                    <article>
                        <h1>Test Article</h1>
                        <p>This is the first paragraph of the article.</p>
                        <p>This is the second paragraph with more content.</p>
                    </article>
                </body>
            </html>
        "#;

        let stopwords: HashSet<String> = ["the", "is", "of"]
            .iter()
            .map(|s| s.to_string())
            .collect();

        let result = BaselineExtractor::new(stopwords).extract(page).unwrap();

        assert!(!result.text.is_empty());
        assert!(result.quality_score > 0.0);
    }
}