use scraper::Html;
/// Parses `html` as a complete document and returns the text of its first
/// `<title>` element.
///
/// The text is trimmed; `None` is returned when there is no `<title>` element
/// or when its trimmed text is empty.
#[must_use]
pub fn extract_title(html: &str) -> Option<String> {
    let document = Html::parse_document(html);
    extract_title_doc(&document)
}
/// Parses `html` as a complete document and collects the `href` values of its
/// `<a href>` elements.
///
/// Empty hrefs and hrefs that start with `#` are skipped. No base URL is
/// supplied, so the values are not resolved against anything.
#[must_use]
pub fn extract_links(html: &str) -> Vec<String> {
    let document = Html::parse_document(html);
    extract_links_doc(&document, None)
}
/// Like [`extract_links`], but resolves hrefs against `base_url`.
///
/// If `base_url` itself fails to parse as a URL, the error is discarded and
/// the links are extracted as if no base had been given.
#[must_use]
pub fn extract_links_with_base(html: &str, base_url: &str) -> Vec<String> {
    let parsed_base = url::Url::parse(base_url);
    let document = Html::parse_document(html);
    match &parsed_base {
        Ok(base) => extract_links_doc(&document, Some(base)),
        // Best effort: an unusable base degrades to "no base" rather than failing.
        Err(_) => extract_links_doc(&document, None),
    }
}
/// Returns the trimmed text of the first `<title>` element in `doc`.
///
/// Yields `None` if the selector cannot be built, if no `<title>` element is
/// matched, or if the trimmed text is empty.
fn extract_title_doc(doc: &Html) -> Option<String> {
    let Ok(title_selector) = scraper::Selector::parse("title") else {
        return None;
    };
    let first_title = doc.select(&title_selector).next()?;
    let raw_text: String = first_title.text().collect();
    let title = raw_text.trim();
    if title.is_empty() {
        None
    } else {
        Some(title.to_owned())
    }
}
/// Collects the link targets of every `<a href>` element in `doc`.
///
/// Each `href` value has surrounding ASCII whitespace stripped first (HTML
/// permits URLs in attributes to be surrounded by spaces). Values that are
/// then empty or that start with `#` (same-page fragment links) are skipped.
/// The remaining values are passed through `resolve_href` together with
/// `base`.
///
/// Returns an empty vector if the `a[href]` selector cannot be built.
pub(crate) fn extract_links_doc(doc: &Html, base: Option<&url::Url>) -> Vec<String> {
    let Ok(sel) = scraper::Selector::parse("a[href]") else {
        return Vec::new();
    };
    doc.select(&sel)
        .filter_map(|el| el.value().attr("href"))
        // Trim before filtering so values such as " #top" or "   " are treated
        // like their unpadded forms instead of slipping through.
        .map(|href| href.trim_matches(|c: char| c.is_ascii_whitespace()))
        .filter(|href| !href.is_empty() && !href.starts_with('#'))
        .map(|href| resolve_href(href, base))
        .collect()
}
/// Returns the trimmed `lang` attribute of the first element matching the
/// `html` selector in `doc`.
///
/// Yields `None` if the selector cannot be built, nothing matches, the
/// attribute is absent, or the trimmed value is empty.
fn extract_language_doc(doc: &Html) -> Option<String> {
    let root_selector = scraper::Selector::parse("html").ok()?;
    let root = doc.select(&root_selector).next()?;
    let lang = root.value().attr("lang")?.trim();
    (!lang.is_empty()).then(|| lang.to_owned())
}
/// Returns the `content` attribute of the first `<meta name="description">`
/// element in `doc`, as produced by `extract_meta_attr`.
fn extract_description_doc(doc: &Html) -> Option<String> {
    let selector = r#"meta[name="description"]"#;
    let attribute = "content";
    extract_meta_attr(doc, selector, attribute)
}
/// Returns the `content` attribute of the first `<meta property="og:image">`
/// element in `doc`, as produced by `extract_meta_attr`.
fn extract_og_image_doc(doc: &Html) -> Option<String> {
    let selector = r#"meta[property="og:image"]"#;
    let attribute = "content";
    extract_meta_attr(doc, selector, attribute)
}
/// Looks up the first element in `doc` matching the CSS `selector` and returns
/// the trimmed value of its attribute `attr`.
///
/// Only the first match is inspected: if it lacks the attribute, later
/// matches are not consulted. Yields `None` if `selector` does not parse,
/// nothing matches, the attribute is absent, or the trimmed value is empty.
fn extract_meta_attr(doc: &Html, selector: &str, attr: &str) -> Option<String> {
    let Ok(compiled) = scraper::Selector::parse(selector) else {
        return None;
    };
    let element = doc.select(&compiled).next()?;
    let value = element.value().attr(attr)?.trim();
    if value.is_empty() {
        None
    } else {
        Some(value.to_owned())
    }
}
/// Metadata extracted from a parsed HTML document.
///
/// Every field is optional; the extractors in this file yield `None` when the
/// corresponding element or attribute is missing or its trimmed value is empty.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct PageMeta {
/// Trimmed text of the first `<title>` element.
pub title: Option<String>,
/// Trimmed `content` of the first `<meta name="description">` element.
pub description: Option<String>,
/// Trimmed `lang` attribute of the first element matching the `html` selector.
pub language: Option<String>,
/// Trimmed `content` of the first `<meta property="og:image">` element.
pub og_image: Option<String>,
}
impl PageMeta {
    /// Builds a `PageMeta` by running each extractor over the already-parsed
    /// `doc`.
    pub(crate) fn from_doc(doc: &Html) -> Self {
        let title = extract_title_doc(doc);
        let description = extract_description_doc(doc);
        let language = extract_language_doc(doc);
        let og_image = extract_og_image_doc(doc);
        Self {
            title,
            description,
            language,
            og_image,
        }
    }
}
/// Turns `href` into the string reported for a link.
///
/// * Without a base URL, `href` is returned unchanged.
/// * If `href` parses on its own via `url::Url::parse`, it is returned
///   unchanged (the parsed form is not used).
/// * Otherwise `href` is joined onto the base; if the join fails, `href` is
///   returned unchanged.
fn resolve_href(href: &str, base_url: Option<&url::Url>) -> String {
    match base_url {
        None => href.to_owned(),
        Some(_) if url::Url::parse(href).is_ok() => href.to_owned(),
        Some(base) => match base.join(href) {
            Ok(resolved) => resolved.to_string(),
            Err(_) => href.to_owned(),
        },
    }
}