use ego_tree::NodeId;
use scraper::{Html, Selector};
use std::collections::HashSet;
use url::Url;
use crate::types::document::{LinkType, OutLink};
use crate::crawl::normalize::UrlNormalizer;
/// Extracts and classifies outbound links (`<a href>`, canonical,
/// pagination, hreflang) from an HTML document.
pub struct LinkExtractor {
    // Normalizer applied to every resolved link URL before it is emitted.
    normalizer: UrlNormalizer,
    // When false, links classified as navigation are meant to be skipped
    // by `extract`.
    extract_navigation: bool,
    // When false, links found in footer regions are meant to be skipped.
    // NOTE(review): in the current `extract` body this flag is read but
    // the skip branch is empty — verify intended behavior.
    extract_footer: bool,
}
impl Default for LinkExtractor {
fn default() -> Self {
Self {
normalizer: UrlNormalizer::default(),
extract_navigation: true,
extract_footer: true,
}
}
}
impl LinkExtractor {
pub fn new() -> Self {
Self::default()
}
pub fn with_options(mut self, extract_navigation: bool, extract_footer: bool) -> Self {
self.extract_navigation = extract_navigation;
self.extract_footer = extract_footer;
self
}
pub fn extract(&self, html: &str, base_url: &Url) -> Vec<OutLink> {
let document = Html::parse_document(html);
let mut links = Vec::new();
let a_selector = Selector::parse("a[href]").unwrap();
let nav_selector = Selector::parse("nav, header, .navigation, .menu, [role=\"navigation\"]").unwrap();
let footer_selector = Selector::parse("footer, .footer, [role=\"contentinfo\"]").unwrap();
let aside_selector = Selector::parse("aside, .sidebar, [role=\"complementary\"]").unwrap();
let nav_ids: std::collections::HashSet<_> = document.select(&nav_selector).map(|e| e.id()).collect();
let footer_ids: std::collections::HashSet<_> = document.select(&footer_selector).map(|e| e.id()).collect();
let aside_ids: std::collections::HashSet<_> = document.select(&aside_selector).map(|e| e.id()).collect();
for element in document.select(&a_selector) {
let href = match element.value().attr("href") {
Some(h) => h,
None => continue,
};
if href.is_empty() || href.starts_with("javascript:") || href.starts_with("mailto:") || href.starts_with("tel:") {
continue;
}
let url = match base_url.join(href) {
Ok(u) => self.normalizer.normalize(&u),
Err(_) => continue,
};
if !url.scheme().starts_with("http") {
continue;
}
let link_type = self.determine_link_type(&element, &nav_ids, &footer_ids, &aside_ids);
if !self.extract_navigation && link_type == LinkType::Navigation {
continue;
}
if !self.extract_footer && matches!(link_type, LinkType::Navigation) {
}
let anchor_text = element.text().collect::<Vec<_>>().join(" ").trim().to_string();
let anchor_text = if anchor_text.is_empty() { None } else { Some(anchor_text) };
let rel = element.value().attr("rel").map(String::from);
let is_internal = url.host() == base_url.host();
links.push(OutLink {
url,
anchor_text,
rel,
is_internal,
link_type,
});
}
links.sort_by(|a, b| a.url.as_str().cmp(b.url.as_str()));
links.dedup_by(|a, b| a.url == b.url);
links
}
fn determine_link_type(
&self,
element: &scraper::ElementRef,
nav_ids: &HashSet<NodeId>,
footer_ids: &HashSet<NodeId>,
aside_ids: &HashSet<NodeId>,
) -> LinkType {
let mut parent = element.parent();
while let Some(p) = parent {
if nav_ids.contains(&p.id()) {
return LinkType::Navigation;
}
if footer_ids.contains(&p.id()) {
return LinkType::Navigation; }
if aside_ids.contains(&p.id()) {
return LinkType::Other;
}
parent = p.parent();
}
LinkType::Content
}
pub fn extract_canonical(&self, html: &str, base_url: &Url) -> Option<Url> {
let document = Html::parse_document(html);
let selector = Selector::parse(r#"link[rel="canonical"]"#).ok()?;
document
.select(&selector)
.next()
.and_then(|el| el.value().attr("href"))
.and_then(|href| base_url.join(href).ok())
.map(|u| self.normalizer.normalize(&u))
}
pub fn extract_pagination(&self, html: &str, base_url: &Url) -> PaginationLinks {
let document = Html::parse_document(html);
let next = Selector::parse(r#"link[rel="next"], a[rel="next"]"#)
.ok()
.and_then(|sel| {
document.select(&sel).next()
.and_then(|el| el.value().attr("href"))
.and_then(|href| base_url.join(href).ok())
});
let prev = Selector::parse(r#"link[rel="prev"], a[rel="prev"]"#)
.ok()
.and_then(|sel| {
document.select(&sel).next()
.and_then(|el| el.value().attr("href"))
.and_then(|href| base_url.join(href).ok())
});
PaginationLinks { next, prev }
}
pub fn extract_hreflang(&self, html: &str, base_url: &Url) -> Vec<HreflangLink> {
let document = Html::parse_document(html);
let selector = match Selector::parse(r#"link[rel="alternate"][hreflang]"#) {
Ok(s) => s,
Err(_) => return Vec::new(),
};
document
.select(&selector)
.filter_map(|el| {
let lang = el.value().attr("hreflang")?;
let href = el.value().attr("href")?;
let url = base_url.join(href).ok()?;
Some(HreflangLink {
lang: lang.to_string(),
url,
})
})
.collect()
}
}
/// Pagination links discovered on a page via `rel="next"` /
/// `rel="prev"` on `<link>` or `<a>` elements.
#[derive(Debug, Clone)]
pub struct PaginationLinks {
    // URL of the next page in the sequence, if declared.
    pub next: Option<Url>,
    // URL of the previous page in the sequence, if declared.
    pub prev: Option<Url>,
}
/// A language/region alternate declared via
/// `<link rel="alternate" hreflang="...">`.
#[derive(Debug, Clone)]
pub struct HreflangLink {
    // The raw hreflang attribute value (e.g. "en-US", "x-default") —
    // not validated as a BCP 47 tag here.
    pub lang: String,
    // Alternate URL, resolved against the page's base URL.
    pub url: Url,
}