use std::collections::HashMap;
use std::sync::RwLock;
use url::Url;
/// Maps crawled URLs to the canonical URL each page declares via
/// `<link rel="canonical">`.
///
/// Interior mutability through an `RwLock` allows registration and
/// resolution through a shared reference (e.g. from multiple threads).
pub struct CanonicalResolver {
    // Keyed by `url.to_string()`; value is the canonical URL that page declared.
    canonical_map: RwLock<HashMap<String, Url>>,
    // When false, every method becomes a passthrough no-op (see `new`).
    respect_canonicals: bool,
}
impl Default for CanonicalResolver {
fn default() -> Self {
Self::new(true)
}
}
impl CanonicalResolver {
    /// Maximum number of canonical hops [`Self::resolve`] will follow.
    /// Guards against registration cycles (A -> B -> A) and degenerate
    /// chains; real sites rarely chain canonicals more than once or twice.
    const MAX_CHAIN_HOPS: usize = 8;

    /// Creates a resolver. When `respect_canonicals` is `false`, all
    /// methods are passthrough no-ops: nothing is registered and every
    /// URL resolves to itself.
    pub fn new(respect_canonicals: bool) -> Self {
        Self {
            canonical_map: RwLock::new(HashMap::new()),
            respect_canonicals,
        }
    }

    /// Records that `url` declares `canonical` as its canonical form,
    /// overwriting any previous registration for `url`. No-op when
    /// canonicals are not respected.
    pub fn register(&self, url: &Url, canonical: &Url) {
        if !self.respect_canonicals {
            return;
        }
        self.canonical_map
            .write()
            .unwrap()
            .insert(url.to_string(), canonical.clone());
    }

    /// Resolves `url` to its canonical form, following registered chains
    /// (A -> B -> C resolves A to C) up to [`Self::MAX_CHAIN_HOPS`] hops.
    /// Returns `url` unchanged when no mapping exists or canonicals are
    /// not respected.
    pub fn resolve(&self, url: &Url) -> Url {
        if !self.respect_canonicals {
            return url.clone();
        }
        let map = self.canonical_map.read().unwrap();
        let mut current = url.clone();
        for _ in 0..Self::MAX_CHAIN_HOPS {
            // `as_str()` lets HashMap<String, _> look up by &str (via
            // Borrow<str>) without allocating a key String per hop.
            match map.get(current.as_str()) {
                // A self-mapping would loop forever at the cap; treat it
                // as the end of the chain, same as a missing entry.
                Some(next) if *next != current => current = next.clone(),
                _ => break,
            }
        }
        current
    }

    /// Returns `true` when `url` has a registered canonical that differs
    /// from `url` itself.
    pub fn has_different_canonical(&self, url: &Url) -> bool {
        // Explicit for consistency with `register`/`resolve`; a disabled
        // resolver never registers anything, but don't rely on that.
        if !self.respect_canonicals {
            return false;
        }
        self.canonical_map
            .read()
            .unwrap()
            .get(url.as_str())
            .map_or(false, |canonical| canonical != url)
    }

    /// Number of registered canonical mappings.
    pub fn count(&self) -> usize {
        self.canonical_map.read().unwrap().len()
    }

    /// Removes all registered mappings.
    pub fn clear(&self) {
        self.canonical_map.write().unwrap().clear();
    }

    /// Extracts the canonical link from `html`, resolved against
    /// `base_url`. Returns `None` when canonicals are not respected,
    /// the page has no canonical link, or the page is self-canonical
    /// (its canonical is `base_url` itself).
    pub fn resolve_from_html(&self, html: &str, base_url: &Url) -> Option<Url> {
        if !self.respect_canonicals {
            return None;
        }
        extract_canonical_from_html(html, base_url).filter(|canonical| canonical != base_url)
    }
}
/// Pulls the first `<link rel="canonical">` target out of `html`,
/// joined against `base_url`. Returns `None` when the document has no
/// canonical link or its `href` cannot be joined into a valid URL.
pub fn extract_canonical_from_html(html: &str, base_url: &Url) -> Option<Url> {
    let selector = scraper::Selector::parse(r#"link[rel="canonical"]"#).ok()?;
    let document = scraper::Html::parse_document(html);
    let link = document.select(&selector).next()?;
    let href = link.value().attr("href")?;
    base_url.join(href).ok()
}
/// `rel="next"` / `rel="prev"` pagination links discovered in a page.
pub struct PaginationLinks {
    /// Target of `<link rel="next">`, if present and joinable.
    pub next: Option<Url>,
    /// Target of `<link rel="prev">`, if present and joinable.
    pub prev: Option<Url>,
}
/// Extracts `<link rel="next">` and `<link rel="prev">` pagination
/// targets from `html`, each joined against `base_url`. A missing or
/// unjoinable link yields `None` in the corresponding field.
pub fn extract_pagination_from_html(html: &str, base_url: &Url) -> PaginationLinks {
    let document = scraper::Html::parse_document(html);
    PaginationLinks {
        next: first_link_url(&document, r#"link[rel="next"]"#, base_url),
        prev: first_link_url(&document, r#"link[rel="prev"]"#, base_url),
    }
}

/// Returns the `href` of the first element matching `selector`, joined
/// against `base_url`. `None` on an invalid selector, no matching
/// element, a missing `href`, or a join failure.
fn first_link_url(document: &scraper::Html, selector: &str, base_url: &Url) -> Option<Url> {
    let sel = scraper::Selector::parse(selector).ok()?;
    document
        .select(&sel)
        .next()
        .and_then(|el| el.value().attr("href"))
        .and_then(|href| base_url.join(href).ok())
}
/// Collects `<link rel="alternate" hreflang="...">` entries from `html`
/// into a map from language tag to absolute URL (joined against
/// `base_url`). Entries with a missing or unjoinable `href` are
/// skipped; if the selector fails to parse, an empty map is returned.
/// Duplicate hreflang tags keep the last occurrence.
pub fn extract_hreflang_from_html(html: &str, base_url: &Url) -> HashMap<String, Url> {
    let mut alternates = HashMap::new();
    if let Ok(selector) = scraper::Selector::parse(r#"link[rel="alternate"][hreflang]"#) {
        let document = scraper::Html::parse_document(html);
        for el in document.select(&selector) {
            let attrs = el.value();
            if let (Some(lang), Some(href)) = (attrs.attr("hreflang"), attrs.attr("href")) {
                if let Ok(url) = base_url.join(href) {
                    alternates.insert(lang.to_string(), url);
                }
            }
        }
    }
    alternates
}