use std::collections::HashSet;

use chrono::{DateTime, NaiveDate, NaiveDateTime};
use regex::Regex;
use scraper::{ElementRef, Html, Selector};

use crate::html_parser::HtmlParser;
use crate::site_profile::ExtractionResult;
use crate::text_utils::TextUtils;
use crate::Result;
/// Stopword-density baseline content extractor.
///
/// Scores candidate DOM nodes (articles, divs, sections) by how much their
/// text resembles prose — stopword-rich, paragraph-structured, link-light —
/// and extracts the best-scoring node as the main article body.
#[derive(Clone, Debug)]
pub struct BaselineExtractor {
    // Stopwords for the target language; used both for candidate scoring
    // and for the final text-quality estimate.
    stopwords: HashSet<String>,
}
impl BaselineExtractor {
    /// Creates an extractor that uses `stopwords` both for scoring candidate
    /// nodes and for the final text-quality estimate.
    pub fn new(stopwords: HashSet<String>) -> Self {
        Self { stopwords }
    }

    /// Extracts the main article text from raw HTML.
    ///
    /// Title and date are read from the raw HTML *before* cleaning, so meta
    /// tags stripped by `clean_html` are still visible to the metadata pass.
    /// Returns an empty result with `quality_score` 0.0 when no candidate
    /// node scores above zero.
    pub fn extract(&self, html: &str) -> Result<ExtractionResult> {
        let title = MetadataExtractor::extract_title(html);
        let date = MetadataExtractor::extract_date(html);
        let document = HtmlParser::clean_html(html)?;
        let candidates = self.get_candidates(&document);
        if candidates.is_empty() {
            return Ok(ExtractionResult {
                text: String::new(),
                xpath: String::new(),
                quality_score: 0.0,
                parameters: std::collections::HashMap::new(),
                title,
                date,
            });
        }
        // NaN-safe comparison: incomparable scores collapse to Equal, and
        // `max_by` keeps the last of any tied maxima.
        let (best_node, _score) = candidates
            .into_iter()
            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
            .expect("candidates is non-empty (checked above)");
        let text = self.extract_text(best_node);
        let xpath = HtmlParser::get_element_path(best_node);
        let quality = TextUtils::calculate_text_quality(&text, &self.stopwords);
        Ok(ExtractionResult {
            text,
            xpath,
            quality_score: quality,
            parameters: std::collections::HashMap::new(),
            title,
            date,
        })
    }

    /// Scores every `<article>`, `<div>`, and `<section>` element and returns
    /// at most the 10 best candidates, sorted by score descending.
    fn get_candidates<'a>(&self, document: &'a Html) -> Vec<(ElementRef<'a>, f64)> {
        let mut candidates = Vec::new();
        for selector_str in ["article", "div", "section"] {
            if let Ok(selector) = Selector::parse(selector_str) {
                for element in document.select(&selector) {
                    let score = self.score_node(element);
                    if score > 0.0 {
                        candidates.push((element, score));
                    }
                }
            }
        }
        candidates.sort_by(|(_, a), (_, b)| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
        candidates.truncate(10);
        candidates
    }

    /// Heuristic content score for a node.
    ///
    /// Quadratic in stopword count (stopword-rich text reads like prose),
    /// boosted by paragraph count (capped at 5), and penalized when more
    /// than half of the node's text length comes from links.
    fn score_node(&self, node: ElementRef) -> f64 {
        let text = HtmlParser::extract_text(node);
        // Too short to plausibly be article body text.
        if text.len() < 50 {
            return 0.0;
        }
        let stopword_count = TextUtils::count_stopwords(&text, &self.stopwords);
        let mut score = (stopword_count * stopword_count) as f64;
        let paragraph_count = HtmlParser::extract_paragraphs(node).len().min(5);
        score *= 1.0 + 0.5 * paragraph_count as f64;
        if let Ok(link_selector) = Selector::parse("a") {
            let link_text: String = node
                .select(&link_selector)
                .map(HtmlParser::extract_text)
                .collect();
            if !text.is_empty() {
                // Link-heavy nodes (navigation bars, related-article lists)
                // are demoted once links dominate the text.
                let link_density = link_text.len() as f64 / text.len() as f64;
                if link_density > 0.5 {
                    score *= 1.0 - link_density;
                }
            }
        }
        score
    }

    /// Joins the node's paragraphs with blank lines, dropping fragments of
    /// fewer than four words (bylines, timestamps, "Share" buttons, etc.).
    fn extract_text(&self, node: ElementRef) -> String {
        HtmlParser::extract_paragraphs(node)
            .into_iter()
            // `nth(3)` succeeds exactly when there are at least 4 words,
            // without collecting the words into a Vec first.
            .filter(|p| p.split_whitespace().nth(3).is_some())
            .collect::<Vec<_>>()
            .join("\n\n")
    }

    /// Returns the `top_k` highest-scoring candidate nodes, best first.
    pub fn get_candidate_nodes<'a>(&self, document: &'a Html, top_k: usize) -> Vec<ElementRef<'a>> {
        self.get_candidates(document)
            .into_iter()
            .take(top_k)
            .map(|(node, _)| node)
            .collect()
    }
}
pub struct MetadataExtractor;
impl MetadataExtractor {
pub fn extract_title(html: &str) -> Option<String> {
let document = Html::parse_document(html);
if let Some(title) = Self::extract_meta_tag(&document, "og:title") {
return Some(title);
}
if let Some(title) = Self::extract_meta_tag(&document, "twitter:title") {
return Some(title);
}
if let Some(title) = Self::extract_meta_tag(&document, "article:title") {
return Some(title);
}
if let Ok(selector) = Selector::parse("title") {
if let Some(title_elem) = document.select(&selector).next() {
let title = title_elem.text().collect::<String>().trim().to_string();
if !title.is_empty() {
return Some(Self::clean_title(&title));
}
}
}
if let Ok(selector) = Selector::parse("h1") {
if let Some(h1_elem) = document.select(&selector).next() {
let title = h1_elem.text().collect::<String>().trim().to_string();
if !title.is_empty() && title.len() > 10 {
return Some(title);
}
}
}
if let Ok(selector) = Selector::parse("article header h1, article h1") {
if let Some(elem) = document.select(&selector).next() {
let title = elem.text().collect::<String>().trim().to_string();
if !title.is_empty() && title.len() > 10 {
return Some(title);
}
}
}
None
}
pub fn extract_date(html: &str) -> Option<String> {
let document = Html::parse_document(html);
if let Some(date) = Self::extract_meta_tag(&document, "article:published_time") {
if let Some(normalized) = Self::normalize_date(&date) {
return Some(normalized);
}
}
if let Some(date) = Self::extract_meta_tag(&document, "datePublished") {
if let Some(normalized) = Self::normalize_date(&date) {
return Some(normalized);
}
}
for name in &["pubdate", "publishdate", "date", "DC.date"] {
if let Some(date) = Self::extract_meta_tag(&document, name) {
if let Some(normalized) = Self::normalize_date(&date) {
return Some(normalized);
}
}
}
if let Ok(selector) = Selector::parse("time[datetime], time[pubdate]") {
if let Some(time_elem) = document.select(&selector).next() {
if let Some(datetime) = time_elem.value().attr("datetime")
.or_else(|| time_elem.value().attr("pubdate")) {
if let Some(normalized) = Self::normalize_date(datetime) {
return Some(normalized);
}
}
}
}
if let Some(date) = Self::extract_date_from_text(html) {
return Some(date);
}
None
}
fn extract_meta_tag(document: &Html, property: &str) -> Option<String> {
let selector_str = format!("meta[property='{}']", property);
if let Ok(selector) = Selector::parse(&selector_str) {
if let Some(elem) = document.select(&selector).next() {
if let Some(content) = elem.value().attr("content") {
return Some(content.to_string());
}
}
}
let selector_str = format!("meta[name='{}']", property);
if let Ok(selector) = Selector::parse(&selector_str) {
if let Some(elem) = document.select(&selector).next() {
if let Some(content) = elem.value().attr("content") {
return Some(content.to_string());
}
}
}
None
}
fn clean_title(title: &str) -> String {
let separators = [" - ", " | ", " – ", " — ", " :: ", " » "];
for sep in &separators {
if let Some(pos) = title.rfind(sep) {
let cleaned = &title[..pos];
if cleaned.len() > 10 {
return cleaned.trim().to_string();
}
}
}
title.trim().to_string()
}
fn normalize_date(date_str: &str) -> Option<String> {
if date_str.contains('T') || date_str.contains("Z") {
return Some(date_str.to_string());
}
let formats = [
"%Y-%m-%d",
"%Y/%m/%d",
"%d-%m-%Y",
"%d/%m/%Y",
"%B %d, %Y",
"%b %d, %Y",
"%d %B %Y",
"%d %b %Y",
"%Y-%m-%dT%H:%M:%S",
"%Y-%m-%d %H:%M:%S",
];
for format in &formats {
if let Ok(parsed) = NaiveDate::parse_from_str(date_str, format) {
return Some(parsed.format("%Y-%m-%d").to_string());
}
if let Ok(parsed) = NaiveDateTime::parse_from_str(date_str, format) {
return Some(parsed.format("%Y-%m-%d").to_string());
}
}
None
}
fn extract_date_from_text(html: &str) -> Option<String> {
lazy_static::lazy_static! {
static ref DATE_PATTERNS: Vec<Regex> = vec![
Regex::new(r"(\d{4}-\d{2}-\d{2})").unwrap(),
Regex::new(r"([A-Z][a-z]+ \d{1,2}, \d{4})").unwrap(),
Regex::new(r"(\d{1,2} [A-Z][a-z]+ \d{4})").unwrap(),
];
}
for pattern in DATE_PATTERNS.iter() {
if let Some(captures) = pattern.captures(html) {
if let Some(matched) = captures.get(1) {
if let Some(normalized) = Self::normalize_date(matched.as_str()) {
return Some(normalized);
}
}
}
}
None
}
}
#[cfg(test)]
mod tests {
use super::*;
// og:title meta tag takes priority over everything else.
#[test]
fn test_extract_title_from_og_tag() {
let html = r#"
<html>
<head>
<meta property="og:title" content="Test Article Title" />
</head>
</html>
"#;
let title = MetadataExtractor::extract_title(html);
assert_eq!(title, Some("Test Article Title".to_string()));
}
// <title> fallback: the " - Site Name" suffix must be stripped.
#[test]
fn test_extract_title_from_title_tag() {
let html = r#"
<html>
<head>
<title>Test Article - Site Name</title>
</head>
</html>
"#;
let title = MetadataExtractor::extract_title(html);
assert_eq!(title, Some("Test Article".to_string()));
}
// article:published_time meta tag should yield a date (exact output
// format for full ISO timestamps is deliberately not pinned here).
#[test]
fn test_extract_date_from_meta() {
let html = r#"
<html>
<head>
<meta property="article:published_time" content="2021-04-05T10:30:00Z" />
</head>
</html>
"#;
let date = MetadataExtractor::extract_date(html);
assert!(date.is_some());
}
// Plain dates in various formats normalize to YYYY-MM-DD.
#[test]
fn test_normalize_date() {
assert_eq!(
MetadataExtractor::normalize_date("2021-04-05"),
Some("2021-04-05".to_string())
);
assert_eq!(
MetadataExtractor::normalize_date("April 5, 2021"),
Some("2021-04-05".to_string())
);
}
// End-to-end: a simple article with two prose paragraphs must produce
// non-empty text and a positive quality score.
#[test]
fn test_baseline_extractor() {
let html = r#"
<html>
<body>
<article>
<h1>Test Article</h1>
<p>This is the first paragraph of the article.</p>
<p>This is the second paragraph with more content.</p>
</article>
</body>
</html>
"#;
let stopwords: HashSet<String> = vec!["the", "is", "of"]
.into_iter()
.map(|s| s.to_string())
.collect();
let extractor = BaselineExtractor::new(stopwords);
let result = extractor.extract(html).unwrap();
assert!(!result.text.is_empty());
assert!(result.quality_score > 0.0);
}
}