use scraper::{Html, Selector};
use crate::dom;
use crate::types::Metadata;
/// Gathers page metadata from `<meta>` tags, JSON-LD `schema` data, and the
/// page `url`, each field resolved through its own fixed priority order.
#[must_use]
pub fn extract_metadata(
    html: &Html,
    url: Option<&str>,
    schema: Option<&serde_json::Value>,
) -> Metadata {
    // Author and site name feed into title cleanup, and the domain is a
    // fallback source for the site name, so compute these first.
    let author = extract_author(html, schema);
    let site_name = extract_site_name(html, schema, author.as_deref());
    let domain = extract_domain(url);
    Metadata {
        title: extract_title(html, schema, site_name.as_deref(), domain.as_deref()),
        description: extract_description(html, schema),
        favicon: extract_favicon(html, url),
        image: extract_image(html, schema),
        language: extract_language(html, schema),
        published: extract_published(html, schema),
        modified: extract_modified(html, schema),
        canonical_url: extract_canonical_url(html, url),
        keywords: extract_keywords(html),
        content_type: extract_content_type(html, schema),
        domain,
        author,
        site_name,
    }
}
/// Returns the trimmed, non-empty `content` attribute of the first `<meta>`
/// element whose attribute `attr` equals `value` (e.g. `meta[name="author"]`).
fn get_meta_content(html: &Html, attr: &str, value: &str) -> Option<String> {
    let sel = Selector::parse(&format!("meta[{attr}=\"{value}\"]")).ok()?;
    let content = html.select(&sel).next()?.value().attr("content")?.trim();
    (!content.is_empty()).then(|| content.to_string())
}
/// Looks up a Dublin Core `<meta>` value for `field`, trying the common
/// prefix spellings in order and returning the first non-empty content.
fn get_dc_content(html: &Html, field: &str) -> Option<String> {
    let variants = [
        format!("DC.{field}"),
        format!("dc.{field}"),
        format!("dc:{field}"),
        format!("dcterm:{field}"),
        format!("DCTERMS.{field}"),
        format!("dcterms.{field}"),
        // Standard DCMI Terms prefix; previously only the non-standard
        // singular `dcterm:` colon spelling was matched. Appended last so
        // precedence of the existing variants is unchanged.
        format!("dcterms:{field}"),
    ];
    variants
        .iter()
        .find_map(|name| get_meta_content(html, "name", name))
}
/// Follows a dot-separated `path` (e.g. `"publisher.name"`) through nested
/// JSON objects and returns the trimmed string at the end, if any.
fn walk_schema_path(value: &serde_json::Value, path: &str) -> Option<String> {
    let target = path.split('.').try_fold(value, |node, key| node.get(key))?;
    let text = target.as_str().map(str::trim)?;
    if text.is_empty() {
        None
    } else {
        Some(text.to_string())
    }
}
/// Resolves `path` against the schema value itself, then (for a top-level
/// JSON-LD array) against each array item, returning the first hit.
fn schema_str(schema: Option<&serde_json::Value>, path: &str) -> Option<String> {
    let data = schema?;
    walk_schema_path(data, path).or_else(|| match data {
        serde_json::Value::Array(items) => {
            items.iter().find_map(|item| walk_schema_path(item, path))
        }
        _ => None,
    })
}
/// Extracts the page title from meta tags, schema data, or the `<title>`
/// element, then strips any site-name suffix/prefix before returning it.
fn extract_title(
    html: &Html,
    schema: Option<&serde_json::Value>,
    site_name: Option<&str>,
    domain: Option<&str>,
) -> Option<String> {
    // Meta/schema sources, in priority order.
    let meta_title = get_meta_content(html, "property", "og:title")
        .or_else(|| get_meta_content(html, "name", "twitter:title"))
        .or_else(|| get_meta_content(html, "property", "twitter:title"))
        .or_else(|| schema_str(schema, "headline"))
        .or_else(|| get_dc_content(html, "title"))
        .or_else(|| get_meta_content(html, "name", "title"))
        .or_else(|| get_meta_content(html, "name", "sailthru.title"))
        .or_else(|| get_meta_content(html, "name", "parsely-title"));
    let html_title = title_element_text(html);
    // Prefer the meta title; fall back to the <title> element text.
    let title = match (&meta_title, &html_title) {
        (Some(t), _) | (None, Some(t)) => t.clone(),
        (None, None) => return None,
    };
    // Without an explicit site name, derive one from the domain so the
    // cleaner can still strip suffixes like "… | Example".
    let effective_site_name = site_name.map_or_else(
        || domain_to_site_name(domain.unwrap_or("")),
        |s| s.to_string(),
    );
    let cleaned = clean_title(
        &title,
        &effective_site_name,
        meta_title.as_deref(),
        html_title.as_deref(),
    );
    (!cleaned.is_empty()).then_some(cleaned)
}
/// Trimmed text content of the first `<title>` element, if non-empty.
fn title_element_text(html: &Html) -> Option<String> {
    let sel = Selector::parse("title").ok()?;
    let el = html.select(&sel).next()?;
    let text = dom::text_content(html, el.id());
    let text = text.trim();
    (!text.is_empty()).then(|| text.to_string())
}
const TITLE_SEPARATORS: &[&str] = &[" | ", " - ", " -- ", " / ", " · "];
/// Removes a site-name segment from `title`.
///
/// When both the meta title and the `<title>` text are available, comparing
/// them is the most reliable way to spot a site-name suffix; otherwise the
/// title is stripped directly against `site_name` (if any).
#[must_use]
pub fn clean_title(
    title: &str,
    site_name: &str,
    meta_title: Option<&str>,
    html_title: Option<&str>,
) -> String {
    if let (Some(meta), Some(html_t)) = (meta_title, html_title)
        && let Some(stripped) = strip_via_html_comparison(meta, html_t, site_name)
    {
        return stripped;
    }
    if site_name.is_empty() {
        title.to_string()
    } else {
        strip_site_name(title, site_name)
    }
}
/// Cleans the meta title by cross-checking it against the `<title>` text.
fn strip_via_html_comparison(
    meta_title: &str,
    html_title: &str,
    site_name: &str,
) -> Option<String> {
    if site_name.is_empty() {
        // No site name: only trust the meta title when the HTML title is a
        // strict extension of it (i.e. the HTML title adds a suffix).
        let html_extends_meta =
            html_title.len() > meta_title.len() && html_title.starts_with(meta_title);
        return html_extends_meta.then(|| meta_title.to_string());
    }
    if strip_site_name(html_title, site_name) == meta_title {
        // The meta title already equals the de-suffixed HTML title; try
        // dropping one trailing breadcrumb segment as well.
        let further = strip_last_breadcrumb_segment(meta_title);
        if further != meta_title {
            return Some(further);
        }
    }
    Some(strip_site_name(meta_title, site_name))
}
/// Splits `title` at the last known separator and drops whichever side
/// resembles `site_name`; when both sides match, the longer side is kept.
fn strip_site_name(title: &str, site_name: &str) -> String {
    let site_lower = site_name.to_lowercase();
    for sep in TITLE_SEPARATORS {
        let Some(idx) = title.rfind(sep) else {
            continue;
        };
        let head = title[..idx].trim();
        let tail = title[idx + sep.len()..].trim();
        if head.is_empty() || tail.is_empty() {
            continue;
        }
        let head_hits = is_site_name_match(&head.to_lowercase(), &site_lower);
        let tail_hits = is_site_name_match(&tail.to_lowercase(), &site_lower);
        match (head_hits, tail_hits) {
            // Both halves resemble the site name: keep the longer one
            // (ties favor the leading half).
            (true, true) => {
                let kept = if head.len() >= tail.len() { head } else { tail };
                return kept.to_string();
            }
            (false, true) => return head.to_string(),
            (true, false) => return tail.to_string(),
            (false, false) => {}
        }
    }
    title.to_string()
}
/// Heuristic test of whether a lowercased title `segment` refers to the
/// (lowercased) site name.
fn is_site_name_match(segment: &str, site_name_lower: &str) -> bool {
    // Exact match or substring containment in either direction.
    if segment == site_name_lower
        || segment.contains(site_name_lower)
        || site_name_lower.contains(segment)
    {
        return true;
    }
    // Multi-word segments only match via the containment rules above.
    if segment.contains(' ') {
        return false;
    }
    // Fuzzy fallback: a long shared prefix between the first words counts
    // as a match (e.g. trailing brand variations).
    let seg_word = segment.split_whitespace().next().unwrap_or("");
    let site_word = site_name_lower.split_whitespace().next().unwrap_or("");
    if seg_word.is_empty() || site_word.is_empty() || seg_word.len() < 4 {
        return false;
    }
    let shared = seg_word
        .chars()
        .zip(site_word.chars())
        .take_while(|(a, b)| a == b)
        .count();
    shared >= 5
}
/// Drops a trailing breadcrumb-like segment ("… | a/b" or a short label)
/// after the last known separator; returns `title` unchanged otherwise.
fn strip_last_breadcrumb_segment(title: &str) -> String {
    for sep in TITLE_SEPARATORS {
        let Some(idx) = title.rfind(sep) else {
            continue;
        };
        let head = title[..idx].trim();
        let tail = title[idx + sep.len()..].trim();
        if head.is_empty() || tail.is_empty() {
            continue;
        }
        // Tails that look like a path or a short (<= 3 word) label are
        // treated as breadcrumbs and removed.
        if tail.contains('/') || tail.split_whitespace().count() <= 3 {
            return head.to_string();
        }
    }
    title.to_string()
}
/// Raw author lookup followed by cleanup; empty results become `None`.
fn extract_author(html: &Html, schema: Option<&serde_json::Value>) -> Option<String> {
    extract_author_raw(html, schema)
        .map(|raw| clean_author(&raw))
        .filter(|cleaned| !cleaned.is_empty())
}
/// Cleans an author string: removes a trailing URL segment such as
/// "Jane Doe - https://…", and rejects values that are only a URL.
fn clean_author(author: &str) -> String {
    let mut cleaned = author.to_string();
    for sep in [" - ", " | ", " · "] {
        let Some(idx) = cleaned.find(sep) else {
            continue;
        };
        let tail = cleaned[idx + sep.len()..].trim();
        if tail.starts_with("http://") || tail.starts_with("https://") {
            cleaned = cleaned[..idx].trim().to_string();
        }
    }
    // A bare URL is not an author name.
    if cleaned.starts_with("http://") || cleaned.starts_with("https://") {
        String::new()
    } else {
        cleaned
    }
}
/// Tries every known author source in priority order: meta tags, schema
/// data, Dublin Core, then DOM heuristics (itemprop/class/link scans).
fn extract_author_raw(html: &Html, schema: Option<&serde_json::Value>) -> Option<String> {
    get_meta_content(html, "property", "author")
        .or_else(|| get_meta_content(html, "name", "author"))
        .or_else(|| get_meta_content(html, "property", "article:author"))
        .or_else(|| get_meta_content(html, "property", "article:author_name"))
        .or_else(|| get_meta_content(html, "name", "sailthru.author"))
        .or_else(|| schema_author(schema))
        .or_else(|| get_meta_content(html, "name", "byl"))
        .or_else(|| get_meta_content(html, "name", "authorList"))
        .or_else(|| {
            // citation_author is "Last, First"; flip it to "First Last".
            get_meta_content(html, "name", "citation_author")
                .map(|s| reverse_citation_author(&s))
        })
        .or_else(|| get_dc_content(html, "creator"))
        .or_else(|| get_meta_content(html, "name", "parsely-author"))
        .or_else(|| itemprop_author(html))
        .or_else(|| class_author(html))
        .or_else(|| author_href_elements(html))
        .or_else(|| authors_link_elements(html))
}
/// Converts a citation-style "Last, First" name to "First Last"; returns
/// the input unchanged when either half is missing.
fn reverse_citation_author(name: &str) -> String {
    match name.split_once(',') {
        Some((last, first)) if !first.trim().is_empty() && !last.trim().is_empty() => {
            format!("{} {}", first.trim(), last.trim())
        }
        _ => name.to_string(),
    }
}
/// Collects up to three non-empty element texts matching `selector_str`,
/// joined with ", ". Returns `None` when the selector is invalid or no
/// element has text. Shared by the two author-link scanners below, whose
/// bodies were previously duplicated verbatim.
fn collect_author_names(html: &Html, selector_str: &str) -> Option<String> {
    let sel = Selector::parse(selector_str).ok()?;
    let mut names = Vec::new();
    for el in html.select(&sel) {
        let trimmed = dom::text_content(html, el.id()).trim().to_string();
        if !trimmed.is_empty() {
            names.push(trimmed);
        }
        // Cap at three names to avoid swallowing whole link lists.
        if names.len() >= 3 {
            break;
        }
    }
    if names.is_empty() {
        None
    } else {
        Some(names.join(", "))
    }
}
/// Author names from links whose href contains `/author/`.
fn author_href_elements(html: &Html) -> Option<String> {
    collect_author_names(html, "[href*=\"/author/\"]")
}
/// Author names from links inside an `.authors` container.
fn authors_link_elements(html: &Html) -> Option<String> {
    collect_author_names(html, ".authors a")
}
/// Reads an author from a schema.org value: an object with a `name`, a
/// bare string, or an array of either (joined with ", ").
fn extract_author_from_value(author: &serde_json::Value) -> Option<String> {
    match author {
        serde_json::Value::Array(authors) => {
            let names: Vec<&str> = authors
                .iter()
                .filter_map(|a| {
                    a.get("name")
                        .and_then(|v| v.as_str())
                        .or_else(|| a.as_str())
                })
                .map(str::trim)
                .filter(|s| !s.is_empty())
                .collect();
            (!names.is_empty()).then(|| names.join(", "))
        }
        _ => {
            let name = author
                .get("name")
                .and_then(|v| v.as_str())
                .or_else(|| author.as_str())?
                .trim();
            (!name.is_empty()).then(|| name.to_string())
        }
    }
}
fn schema_author(schema: Option<&serde_json::Value>) -> Option<String> {
let data = schema?;
if let Some(author) = data.get("author")
&& let Some(result) = extract_author_from_value(author)
{
return Some(result);
}
if let serde_json::Value::Array(items) = data {
for item in items {
if let Some(author) = item.get("author")
&& let Some(result) = extract_author_from_value(author)
{
return Some(result);
}
}
}
None
}
/// Trimmed text of the first element with `itemprop="author"`, if any.
fn itemprop_author(html: &Html) -> Option<String> {
    let sel = Selector::parse("[itemprop=\"author\"]").ok()?;
    let first = html.select(&sel).next()?;
    let text = dom::text_content(html, first.id());
    let text = text.trim();
    if text.is_empty() {
        None
    } else {
        Some(text.to_string())
    }
}
/// Author text from rel/itemprop/class markers, checked in priority order.
/// Elements inside comment sections are skipped so reader names don't get
/// picked up, and long (> 6 word) texts are rejected as non-names.
fn class_author(html: &Html) -> Option<String> {
    ["[rel=\"author\"]", "[itemprop=\"author\"]", ".author"]
        .into_iter()
        .filter_map(|s| Selector::parse(s).ok())
        .find_map(|sel| {
            html.select(&sel).find_map(|el| {
                if is_inside_comment_section(html, el.id()) {
                    return None;
                }
                let text = dom::text_content(html, el.id());
                let trimmed = text.trim();
                if !trimmed.is_empty() && trimmed.split_whitespace().count() <= 6 {
                    Some(trimmed.to_string())
                } else {
                    None
                }
            })
        })
}
/// Walks from `node_id` up through its ancestors, returning `true` if any
/// element's class attribute contains a comment-section marker.
fn is_inside_comment_section(html: &Html, node_id: ego_tree::NodeId) -> bool {
    const MARKERS: [&str; 4] = ["comment", "mention", "repli", "backlink"];
    let mut node = Some(node_id);
    while let Some(id) = node {
        if let Some(class) = dom::get_attr(html, id, "class") {
            let lower = class.to_lowercase();
            if MARKERS.iter().any(|m| lower.contains(m)) {
                return true;
            }
        }
        node = dom::parent_element(html, id);
    }
    false
}
/// Publication date, checked from most structured (JSON-LD) down to
/// free-text scraping of likely dateline elements.
fn extract_published(html: &Html, schema: Option<&serde_json::Value>) -> Option<String> {
    if let Some(v) = schema_str(schema, "datePublished") {
        return Some(v);
    }
    if let Some(v) = get_meta_content(html, "name", "publishDate") {
        return Some(v);
    }
    if let Some(v) = get_meta_content(html, "property", "article:published_time") {
        return Some(v);
    }
    if let Some(v) = get_dc_content(html, "date") {
        return Some(v);
    }
    if let Some(v) = get_meta_content(html, "name", "DCTERMS.created") {
        return Some(v);
    }
    if let Some(v) = get_meta_content(html, "name", "DCTERMS.issued") {
        return Some(v);
    }
    if let Some(v) = get_meta_content(html, "name", "parsely-pub-date") {
        return Some(v);
    }
    if let Some(v) = abbr_date_published(html) {
        return Some(v);
    }
    if let Some(v) = get_meta_content(html, "name", "sailthru.date") {
        return Some(v);
    }
    if let Some(v) = first_time_element(html) {
        return Some(v);
    }
    if let Some(v) = first_relative_time_element(html) {
        return Some(v);
    }
    date_from_text_elements(html)
}
/// Last-modified date from schema data or the usual meta-tag spellings.
fn extract_modified(html: &Html, schema: Option<&serde_json::Value>) -> Option<String> {
    if let Some(v) = schema_str(schema, "dateModified") {
        return Some(v);
    }
    let meta_sources = [
        ("property", "article:modified_time"),
        ("property", "og:updated_time"),
        ("name", "DCTERMS.modified"),
        ("name", "dcterms.modified"),
        ("http-equiv", "last-modified"),
    ];
    meta_sources
        .iter()
        .find_map(|&(attr, value)| get_meta_content(html, attr, value))
}
/// Date from an `<abbr itemprop="datePublished">` element, preferring the
/// machine-readable `datetime` attribute over the visible text.
fn abbr_date_published(html: &Html) -> Option<String> {
    let sel = Selector::parse("abbr[itemprop=\"datePublished\"]").ok()?;
    let el = html.select(&sel).next()?;
    let from_attr = el
        .value()
        .attr("datetime")
        .map(str::trim)
        .filter(|s| !s.is_empty())
        .map(str::to_string);
    if from_attr.is_some() {
        return from_attr;
    }
    let text = dom::text_content(html, el.id());
    let text = text.trim();
    (!text.is_empty()).then(|| text.to_string())
}
/// Date from the first few `<time>` elements: any `datetime` attribute
/// wins over text content, across all candidates.
fn first_time_element(html: &Html) -> Option<String> {
    let sel = Selector::parse("time").ok()?;
    // Only the first five <time> elements are considered.
    let candidates: Vec<_> = html.select(&sel).take(5).collect();
    let from_attr = candidates.iter().find_map(|el| {
        el.value()
            .attr("datetime")
            .map(str::trim)
            .filter(|s| !s.is_empty())
            .map(str::to_string)
    });
    from_attr.or_else(|| {
        candidates.iter().find_map(|el| {
            let text = dom::text_content(html, el.id());
            let trimmed = text.trim();
            (!trimmed.is_empty()).then(|| trimmed.to_string())
        })
    })
}
/// `datetime` attribute of the first `<relative-time>` custom element
/// (used by e.g. GitHub-style pages), if present and non-empty.
fn first_relative_time_element(html: &Html) -> Option<String> {
    let sel = Selector::parse("relative-time[datetime]").ok()?;
    let el = html.select(&sel).next()?;
    let dt = el.value().attr("datetime")?.trim();
    if dt.is_empty() {
        None
    } else {
        Some(dt.to_string())
    }
}
/// Scans likely byline/metadata elements for a human-readable or ISO date
/// and returns it normalized via `parse_date_match`.
fn date_from_text_elements(html: &Html) -> Option<String> {
    use std::sync::LazyLock;
    // Compiled once; matches "Month D, YYYY", "D Month YYYY", or
    // "YYYY-MM-DD".
    static DATE_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
        regex::Regex::new(concat!(
            r"(?:",
            r"(?:January|February|March|April|May|June|July|August|",
            r"September|October|November|December)\s+\d{1,2},?\s+\d{4}",
            r"|",
            r"\d{1,2}\s+(?:January|February|March|April|May|June|July|August|",
            r"September|October|November|December)\s+\d{4}",
            r"|",
            r"\d{4}-\d{2}-\d{2}",
            r")",
        ))
        .expect("date regex is valid")
    });
    // Ordered from most date-specific selectors down to broad tags like
    // <p>/<span>, so the most plausible dateline is found first.
    let selectors = [
        "li[id*=\"lastmod\"]",
        "li[id*=\"date\"]",
        "[class*=\"dateline\"]",
        "h2",
        "h3",
        "h6",
        "[class*=\"byline\"]",
        "[class*=\"meta\"]",
        "p",
        "span",
    ];
    for sel_str in selectors {
        let Ok(sel) = Selector::parse(sel_str) else {
            continue;
        };
        // Only the first 20 matches per selector are examined; long text
        // blocks are skipped since they are unlikely to be a dateline.
        for el in html.select(&sel).take(20) {
            let text = dom::text_content(html, el.id());
            let trimmed = text.trim();
            if trimmed.len() > 300 {
                continue;
            }
            if let Some(m) = DATE_RE.find(trimmed) {
                return parse_date_match(m.as_str());
            }
        }
    }
    None
}
/// Validates/normalizes a regex date match to "YYYY-MM-DD".
///
/// An ISO-shaped match is range-checked and passed through; note that a
/// component that fails to parse aborts with `None` (matching the `?`
/// short-circuit), while out-of-range components fall through to the
/// English-date normalizers.
fn parse_date_match(s: &str) -> Option<String> {
    if s.len() == 10 {
        let bytes = s.as_bytes();
        if bytes.get(4) == Some(&b'-') && bytes.get(7) == Some(&b'-') {
            let year = s.get(0..4)?.parse::<u32>().ok()?;
            let month = s.get(5..7)?.parse::<u32>().ok()?;
            let day = s.get(8..10)?.parse::<u32>().ok()?;
            let in_range = (1900..=2100).contains(&year)
                && (1..=12).contains(&month)
                && (1..=31).contains(&day);
            if in_range {
                return Some(s.to_string());
            }
        }
    }
    // "Month D, YYYY" first, then "D Month YYYY".
    normalize_english_date(s).or_else(|| normalize_dd_month_yyyy(s))
}
/// Full English month names paired with zero-padded month numbers; shared
/// by the two date normalizers below (previously duplicated in each).
const MONTHS: &[(&str, &str)] = &[
    ("January", "01"),
    ("February", "02"),
    ("March", "03"),
    ("April", "04"),
    ("May", "05"),
    ("June", "06"),
    ("July", "07"),
    ("August", "08"),
    ("September", "09"),
    ("October", "10"),
    ("November", "11"),
    ("December", "12"),
];
/// Normalizes "Month D, YYYY" (e.g. "January 15, 2025") to "YYYY-MM-DD".
/// Returns `None` unless the string starts with a month name and has a
/// day in 1..=31 and a year in 1900..=2100.
fn normalize_english_date(s: &str) -> Option<String> {
    for (name, num) in MONTHS {
        if let Some(rest) = s.strip_prefix(name) {
            let rest = rest.trim().trim_start_matches(',').trim();
            let parts: Vec<&str> = rest.split_whitespace().collect();
            if parts.len() >= 2 {
                let day = parts[0].trim_end_matches(',');
                let year = parts[1];
                if let (Ok(d), Ok(y)) = (day.parse::<u32>(), year.parse::<u32>()) {
                    if (1..=31).contains(&d) && (1900..=2100).contains(&y) {
                        return Some(format!("{y}-{num}-{d:02}"));
                    }
                }
            }
        }
    }
    None
}
/// Normalizes "D Month YYYY" (e.g. "15 January 2025") to "YYYY-MM-DD",
/// with the same day/year range checks as `normalize_english_date`.
fn normalize_dd_month_yyyy(s: &str) -> Option<String> {
    let parts: Vec<&str> = s.split_whitespace().collect();
    if parts.len() < 3 {
        return None;
    }
    let day: u32 = parts[0].parse().ok()?;
    let month_name = parts[1];
    let year: u32 = parts[2].trim_end_matches(',').parse().ok()?;
    if !(1..=31).contains(&day) || !(1900..=2100).contains(&year) {
        return None;
    }
    let month_num = MONTHS.iter().find(|(name, _)| *name == month_name)?.1;
    Some(format!("{year}-{month_num}-{day:02}"))
}
/// Site name from schema/meta sources, with a short plausible author name
/// and the `<title>` suffix as last resorts; names longer than six words
/// are rejected outright.
fn extract_site_name(
    html: &Html,
    schema: Option<&serde_json::Value>,
    author: Option<&str>,
) -> Option<String> {
    let author_as_site = || {
        let a = author?;
        // Only a short, comma-free, non-URL author plausibly doubles as a
        // site name.
        let plausible =
            a.split_whitespace().count() <= 4 && !a.contains(',') && !a.contains("http");
        plausible.then(|| a.to_string())
    };
    let name = schema_str(schema, "publisher.name")
        .or_else(|| get_meta_content(html, "property", "og:site_name"))
        .or_else(|| schema_graph_website_name(schema))
        .or_else(|| schema_str(schema, "sourceOrganization.name"))
        .or_else(|| get_meta_content(html, "name", "copyright"))
        .or_else(|| schema_str(schema, "copyrightHolder.name"))
        .or_else(|| schema_str(schema, "isPartOf.name"))
        .or_else(|| get_dc_content(html, "publisher"))
        .or_else(|| get_meta_content(html, "name", "application-name"))
        .or_else(author_as_site)
        .or_else(|| site_name_from_title(html))?;
    // Long strings are almost certainly not a site name.
    (name.split_whitespace().count() <= 6).then_some(name)
}
/// Infers a site name from a `<title>` shaped like "Article Title | Site":
/// a short suffix after a longer, multi-word prefix.
fn site_name_from_title(html: &Html) -> Option<String> {
    let title = title_element_text(html)?;
    for sep in [" | ", " - "] {
        let Some(idx) = title.rfind(sep) else {
            continue;
        };
        let head = title[..idx].trim();
        let tail = title[idx + sep.len()..].trim();
        if head.is_empty() || tail.is_empty() {
            continue;
        }
        let looks_like_site_suffix = tail.split_whitespace().count() <= 4
            && head.split_whitespace().count() >= 2
            && head.len() >= tail.len();
        if looks_like_site_suffix {
            return Some(tail.to_string());
        }
    }
    None
}
/// Finds the first `@graph` item typed `WebSite` (directly or within a
/// `@type` array) that has a non-empty `name`.
fn website_name_from_graph(obj: &serde_json::Value) -> Option<String> {
    obj.get("@graph")?
        .as_array()?
        .iter()
        .filter(|item| {
            let Some(type_val) = item.get("@type") else {
                return false;
            };
            type_val.as_str() == Some("WebSite")
                || type_val
                    .as_array()
                    .is_some_and(|a| a.iter().any(|v| v.as_str() == Some("WebSite")))
        })
        .find_map(|item| {
            let name = item.get("name")?.as_str()?.trim();
            (!name.is_empty()).then(|| name.to_string())
        })
}
fn schema_graph_website_name(schema: Option<&serde_json::Value>) -> Option<String> {
let data = schema?;
if let Some(result) = website_name_from_graph(data) {
return Some(result);
}
if let serde_json::Value::Array(items) = data {
for item in items {
if let Some(result) = website_name_from_graph(item) {
return Some(result);
}
}
}
None
}
/// Page description from meta tags (standard, Open Graph, Dublin Core,
/// Twitter, Sailthru), falling back to the schema `description`.
fn extract_description(html: &Html, schema: Option<&serde_json::Value>) -> Option<String> {
    if let Some(d) = get_meta_content(html, "name", "description") {
        return Some(d);
    }
    if let Some(d) = get_meta_content(html, "property", "og:description") {
        return Some(d);
    }
    if let Some(d) = get_dc_content(html, "description") {
        return Some(d);
    }
    if let Some(d) = get_meta_content(html, "property", "twitter:description") {
        return Some(d);
    }
    if let Some(d) = get_meta_content(html, "name", "twitter:description") {
        return Some(d);
    }
    if let Some(d) = get_meta_content(html, "name", "sailthru.description") {
        return Some(d);
    }
    schema_str(schema, "description")
}
/// Lead image URL from meta tags, in priority order, then schema data.
fn extract_image(html: &Html, schema: Option<&serde_json::Value>) -> Option<String> {
    let meta_sources = [
        ("property", "og:image"),
        ("property", "twitter:image"),
        ("name", "twitter:image"),
        ("name", "twitter:image:src"),
        ("name", "parsely-image-url"),
        ("name", "thumbnail"),
    ];
    meta_sources
        .iter()
        .find_map(|&(attr, value)| get_meta_content(html, attr, value))
        .or_else(|| schema_image(schema))
}
fn schema_image(schema: Option<&serde_json::Value>) -> Option<String> {
let image = schema?.get("image")?;
schema_image_from_value(image)
}
/// Accepts a bare URL string, an object with a `url` field, or a list of
/// either (recursing; first usable entry wins).
fn schema_image_from_value(value: &serde_json::Value) -> Option<String> {
    let direct = value
        .as_str()
        .or_else(|| value.get("url").and_then(|v| v.as_str()))
        .map(str::trim)
        .filter(|s| !s.is_empty());
    if let Some(url) = direct {
        return Some(url.to_string());
    }
    value.as_array()?.iter().find_map(schema_image_from_value)
}
/// Page language from the `<html lang>` attribute, meta tags, Dublin
/// Core, Open Graph locale, or schema, normalized toward BCP 47 form.
fn extract_language(html: &Html, schema: Option<&serde_json::Value>) -> Option<String> {
    let raw = html_lang(html)
        .or_else(|| get_meta_content(html, "name", "content-language"))
        .or_else(|| get_meta_content(html, "http-equiv", "content-language"))
        .or_else(|| get_dc_content(html, "language"))
        .or_else(|| get_meta_content(html, "property", "og:locale"))
        .or_else(|| schema_str(schema, "inLanguage"))?;
    Some(normalize_bcp47(&raw))
}
/// Converts locale-style underscores ("en_US") to BCP 47 hyphens ("en-US").
fn normalize_bcp47(lang: &str) -> String {
    lang.chars()
        .map(|c| if c == '_' { '-' } else { c })
        .collect()
}
/// Trimmed `lang` attribute of the root `<html>` element, if non-empty.
fn html_lang(html: &Html) -> Option<String> {
    let sel = Selector::parse("html").ok()?;
    let root = html.select(&sel).next()?;
    root.value()
        .attr("lang")
        .map(str::trim)
        .filter(|l| !l.is_empty())
        .map(str::to_string)
}
/// Favicon URL from meta/link tags, falling back to `<origin>/favicon.ico`.
fn extract_favicon(html: &Html, url: Option<&str>) -> Option<String> {
    if let Some(v) = get_meta_content(html, "property", "og:image:favicon") {
        return Some(v);
    }
    if let Some(v) = link_icon(html, "icon", url) {
        return Some(v);
    }
    if let Some(v) = link_icon(html, "shortcut icon", url) {
        return Some(v);
    }
    favicon_fallback(url)
}
/// `href` of the first `<link rel="{rel}">`, resolved against `base_url`.
fn link_icon(html: &Html, rel: &str, base_url: Option<&str>) -> Option<String> {
    let sel = Selector::parse(&format!("link[rel=\"{rel}\"]")).ok()?;
    let href = html.select(&sel).next()?.value().attr("href")?.trim();
    (!href.is_empty()).then(|| resolve_favicon_url(href, base_url))
}
fn resolve_favicon_url(href: &str, base_url: Option<&str>) -> String {
if href.starts_with("http://") || href.starts_with("https://") {
return href.to_string();
}
if let Some(base) = base_url
&& let Ok(base_parsed) = url::Url::parse(base)
&& let Ok(resolved) = base_parsed.join(href)
{
return resolved.to_string();
}
href.to_string()
}
/// Conventional `/favicon.ico` at the page's origin, when a URL is known.
fn favicon_fallback(url: Option<&str>) -> Option<String> {
    base_url(url?).map(|base| format!("{base}/favicon.ico"))
}
/// Derives a display site name from a domain: drops a leading "www." or a
/// two-letter lowercase subdomain (e.g. "en."), keeps the first label, and
/// capitalizes its first character. Empty input yields an empty string.
fn domain_to_site_name(domain: &str) -> String {
    if domain.is_empty() {
        return String::new();
    }
    let stripped = match domain.strip_prefix("www.") {
        Some(rest) => rest,
        None => match domain.split_once('.') {
            Some((sub, rest))
                if sub.len() == 2 && sub.chars().all(|c| c.is_ascii_lowercase()) =>
            {
                rest
            }
            _ => domain,
        },
    };
    let name_part = stripped.split('.').next().unwrap_or(stripped);
    let mut chars = name_part.chars();
    chars.next().map_or_else(String::new, |first| {
        // to_uppercase is char-by-char and may expand to multiple chars.
        let mut capitalized = first.to_uppercase().to_string();
        capitalized.extend(chars);
        capitalized
    })
}
/// Host of a parseable `url`, or `None` when absent/unparseable/empty.
fn extract_domain(url: Option<&str>) -> Option<String> {
    let parsed = url::Url::parse(url?).ok()?;
    match parsed.host_str() {
        Some(host) if !host.is_empty() => Some(host.to_string()),
        _ => None,
    }
}
/// Origin of `raw` as "scheme://host" (with ":port" when explicit).
fn base_url(raw: &str) -> Option<String> {
    let parsed = url::Url::parse(raw).ok()?;
    let host = parsed.host_str()?;
    let rendered = match parsed.port() {
        Some(port) => format!("{}://{}:{}", parsed.scheme(), host, port),
        None => format!("{}://{}", parsed.scheme(), host),
    };
    Some(rendered)
}
/// Canonical URL from `<link rel="canonical">` or `og:url`, resolved
/// against the page URL when relative (left as-is when resolution fails).
fn extract_canonical_url(html: &Html, page_url: Option<&str>) -> Option<String> {
    let link_href = Selector::parse("link[rel=\"canonical\"]")
        .ok()
        .and_then(|sel| {
            let href = html.select(&sel).next()?.value().attr("href")?.trim();
            (!href.is_empty()).then(|| href.to_string())
        });
    let href = link_href.or_else(|| get_meta_content(html, "property", "og:url"))?;
    if href.starts_with("http://") || href.starts_with("https://") {
        return Some(href);
    }
    match page_url.and_then(|u| url::Url::parse(u).ok()) {
        Some(base) => Some(
            base.join(&href)
                .map_or(href, |resolved| resolved.to_string()),
        ),
        None => Some(href),
    }
}
/// Keywords from a comma-separated meta value, or failing that from
/// repeated `article:tag` meta elements. Empty entries are dropped.
fn extract_keywords(html: &Html) -> Vec<String> {
    let comma_separated = get_meta_content(html, "name", "keywords")
        .or_else(|| get_dc_content(html, "subject"))
        .or_else(|| get_meta_content(html, "name", "news_keywords"))
        .or_else(|| get_meta_content(html, "name", "parsely-tags"));
    if let Some(raw) = comma_separated {
        return raw
            .split(',')
            .map(str::trim)
            .filter(|s| !s.is_empty())
            .map(str::to_string)
            .collect();
    }
    let Ok(sel) = Selector::parse("meta[property=\"article:tag\"]") else {
        return Vec::new();
    };
    html.select(&sel)
        .filter_map(|el| el.value().attr("content"))
        .map(str::trim)
        .filter(|s| !s.is_empty())
        .map(str::to_string)
        .collect()
}
/// Content type from `og:type`, Dublin Core, or schema `@type`; values
/// that are only whitespace are rejected.
fn extract_content_type(html: &Html, schema: Option<&serde_json::Value>) -> Option<String> {
    let raw = get_meta_content(html, "property", "og:type")
        .or_else(|| get_dc_content(html, "type"))
        .or_else(|| schema_type(schema))?;
    if raw.trim().is_empty() {
        None
    } else {
        Some(raw)
    }
}
/// Best-effort schema.org content type from JSON-LD data.
///
/// A top-level object's `@type` is returned directly; for a top-level
/// array, generic wrapper types (`WebSite`, `WebPage`, `BreadcrumbList`)
/// are skipped so a more specific item type (e.g. `Article`) wins.
/// `@type` itself may be a string or an array of strings, mirroring how
/// `website_name_from_graph` reads `@type`.
fn schema_type(schema: Option<&serde_json::Value>) -> Option<String> {
    let data = schema?;
    if let Some(t) = data.get("@type").and_then(type_as_str) {
        return Some(t.to_string());
    }
    if let serde_json::Value::Array(items) = data {
        for item in items {
            if let Some(t) = item.get("@type").and_then(type_as_str) {
                if t != "WebSite" && t != "WebPage" && t != "BreadcrumbList" {
                    return Some(t.to_string());
                }
            }
        }
    }
    None
}
/// Reads a `@type` that is either a plain string or an array of strings
/// (JSON-LD allows both), returning the first string found.
fn type_as_str(type_val: &serde_json::Value) -> Option<&str> {
    type_val
        .as_str()
        .or_else(|| type_val.as_array()?.iter().find_map(|v| v.as_str()))
}
#[cfg(test)]
mod tests {
use super::*;
use scraper::Html;
#[test]
fn title_from_og() {
let doc = Html::parse_document(
r#"<html><head>
<meta property="og:title" content="OG Title">
<title>Fallback</title>
</head><body></body></html>"#,
);
let m = extract_metadata(&doc, None, None);
assert_eq!(m.title.as_deref(), Some("OG Title"));
}
#[test]
fn title_suffix_inferred_as_site_name() {
let doc = Html::parse_document(
r"<html><head>
<title>Article Name | Site Name</title>
</head><body></body></html>",
);
let m = extract_metadata(&doc, None, None);
assert_eq!(m.title.as_deref(), Some("Article Name"));
assert_eq!(m.site_name.as_deref(), Some("Site Name"));
}
#[test]
fn title_stripped_with_og_site_name() {
let doc = Html::parse_document(
r#"<html><head>
<meta property="og:site_name" content="Site Name">
<title>Article Name | Site Name</title>
</head><body></body></html>"#,
);
let m = extract_metadata(&doc, None, None);
assert_eq!(m.title.as_deref(), Some("Article Name"));
}
#[test]
fn title_stripped_when_site_name_matches() {
let doc = Html::parse_document(
r#"<html><head>
<meta property="og:site_name" content="Wikipedia">
<title>Bengaluru - Wikipedia</title>
</head><body></body></html>"#,
);
let m = extract_metadata(&doc, None, None);
assert_eq!(m.title.as_deref(), Some("Bengaluru"));
}
#[test]
fn title_not_stripped_when_site_name_mismatches() {
let doc = Html::parse_document(
r#"<html><head>
<meta property="og:site_name" content="MyBlog">
<title>Part A - Part B</title>
</head><body></body></html>"#,
);
let m = extract_metadata(&doc, None, None);
assert_eq!(m.title.as_deref(), Some("Part A - Part B"));
}
#[test]
fn author_from_meta() {
let doc = Html::parse_document(
r#"<html><head>
<meta name="author" content="Jane Doe">
</head><body></body></html>"#,
);
let m = extract_metadata(&doc, None, None);
assert_eq!(m.author.as_deref(), Some("Jane Doe"));
}
#[test]
fn author_from_schema_object() {
let schema: serde_json::Value = serde_json::json!({
"author": { "name": "Schema Author" }
});
let doc = Html::parse_document("<html><body></body></html>");
let m = extract_metadata(&doc, None, Some(&schema));
assert_eq!(m.author.as_deref(), Some("Schema Author"));
}
#[test]
fn domain_extracted_from_url() {
let doc = Html::parse_document("<html><body></body></html>");
let m = extract_metadata(&doc, Some("https://example.com/page"), None);
assert_eq!(m.domain.as_deref(), Some("example.com"));
}
#[test]
fn favicon_fallback_to_root() {
let doc = Html::parse_document("<html><body></body></html>");
let m = extract_metadata(&doc, Some("https://example.com/a/b"), None);
assert_eq!(
m.favicon.as_deref(),
Some("https://example.com/favicon.ico")
);
}
#[test]
fn language_from_html_attr() {
let doc = Html::parse_document(r#"<html lang="en-US"><body></body></html>"#);
let m = extract_metadata(&doc, None, None);
assert_eq!(m.language.as_deref(), Some("en-US"));
}
#[test]
fn description_from_meta() {
let doc = Html::parse_document(
r#"<html><head>
<meta name="description" content="A page about things">
</head><body></body></html>"#,
);
let m = extract_metadata(&doc, None, None);
assert_eq!(m.description.as_deref(), Some("A page about things"));
}
#[test]
fn published_from_schema() {
let schema: serde_json::Value = serde_json::json!({
"datePublished": "2025-01-15"
});
let doc = Html::parse_document("<html><body></body></html>");
let m = extract_metadata(&doc, None, Some(&schema));
assert_eq!(m.published.as_deref(), Some("2025-01-15"));
}
#[test]
fn site_name_from_og() {
let doc = Html::parse_document(
r#"<html><head>
<meta property="og:site_name" content="My Site">
</head><body></body></html>"#,
);
let m = extract_metadata(&doc, None, None);
assert_eq!(m.site_name.as_deref(), Some("My Site"));
}
#[test]
fn image_from_schema_string() {
let schema: serde_json::Value = serde_json::json!({
"image": "https://img.example.com/photo.jpg"
});
let doc = Html::parse_document("<html><body></body></html>");
let m = extract_metadata(&doc, None, Some(&schema));
assert_eq!(
m.image.as_deref(),
Some("https://img.example.com/photo.jpg")
);
}
#[test]
fn image_from_schema_object() {
let schema: serde_json::Value = serde_json::json!({
"image": { "url": "https://img.example.com/photo.jpg" }
});
let doc = Html::parse_document("<html><body></body></html>");
let m = extract_metadata(&doc, None, Some(&schema));
assert_eq!(
m.image.as_deref(),
Some("https://img.example.com/photo.jpg")
);
}
#[test]
fn image_from_schema_array_of_strings() {
let schema: serde_json::Value = serde_json::json!({
"image": ["https://img.example.com/a.jpg", "https://img.example.com/b.jpg"]
});
let doc = Html::parse_document("<html><body></body></html>");
let m = extract_metadata(&doc, None, Some(&schema));
assert_eq!(m.image.as_deref(), Some("https://img.example.com/a.jpg"));
}
#[test]
fn image_from_schema_array_of_objects() {
let schema: serde_json::Value = serde_json::json!({
"image": [{ "url": "https://img.example.com/a.jpg" }]
});
let doc = Html::parse_document("<html><body></body></html>");
let m = extract_metadata(&doc, None, Some(&schema));
assert_eq!(m.image.as_deref(), Some("https://img.example.com/a.jpg"));
}
#[test]
fn empty_metadata_for_blank_doc() {
let doc = Html::parse_document("<html><body></body></html>");
let m = extract_metadata(&doc, None, None);
assert!(m.title.is_none());
assert!(m.author.is_none());
assert!(m.published.is_none());
assert!(m.domain.is_none());
}
#[test]
fn clean_title_strips_trailing_site_name_pipe() {
let result = clean_title("Article Title | Site Name", "Site Name", None, None);
assert_eq!(result, "Article Title");
}
#[test]
fn clean_title_strips_trailing_site_name_dash() {
let result = clean_title("Article Title - Site Name", "Site Name", None, None);
assert_eq!(result, "Article Title");
}
#[test]
fn clean_title_strips_leading_site_name() {
let result = clean_title("Site Name | Article Title", "Site Name", None, None);
assert_eq!(result, "Article Title");
}
#[test]
fn clean_title_unchanged_without_separator() {
let result = clean_title("Title With No Separator", "Site Name", None, None);
assert_eq!(result, "Title With No Separator");
}
#[test]
fn clean_title_keeps_longer_part() {
    // When the site name is the shorter half, the longer half survives.
    let cleaned = clean_title("Short | Very Long Article Title Here", "Short", None, None);
    assert_eq!(cleaned, "Very Long Article Title Here");
}
#[test]
fn schema_str_from_array() {
    // A top-level JSON-LD array: fields are found in whichever element
    // carries them, not just the first.
    let data = serde_json::json!([
        {"@type": "WebPage"},
        {"@type": "Article", "headline": "Array Title", "datePublished": "2025-06-01"}
    ]);
    let document = Html::parse_document("<html><body></body></html>");
    let meta = extract_metadata(&document, None, Some(&data));
    assert_eq!(meta.title.as_deref(), Some("Array Title"));
    assert_eq!(meta.published.as_deref(), Some("2025-06-01"));
}
#[test]
fn author_from_schema_array() {
    // Multiple author objects are joined into a comma-separated list.
    let data = serde_json::json!({
        "author": [{"name": "Alice"}, {"name": "Bob"}]
    });
    let document = Html::parse_document("<html><body></body></html>");
    let meta = extract_metadata(&document, None, Some(&data));
    assert_eq!(meta.author.as_deref(), Some("Alice, Bob"));
}
#[test]
fn author_from_array_schema_item() {
    // The author object can live inside a non-first element of a
    // top-level JSON-LD array.
    let data = serde_json::json!([
        {"@type": "WebPage"},
        {"@type": "Article", "author": {"name": "Charlie"}}
    ]);
    let document = Html::parse_document("<html><body></body></html>");
    let meta = extract_metadata(&document, None, Some(&data));
    assert_eq!(meta.author.as_deref(), Some("Charlie"));
}
#[test]
fn graph_website_name_from_array_schema() {
    // Site name nested under an "@graph" inside a top-level array
    // is still discovered.
    let data = serde_json::json!([
        {
            "@graph": [
                {"@type": "WebSite", "name": "My Blog"}
            ]
        }
    ]);
    let document = Html::parse_document("<html><body></body></html>");
    let meta = extract_metadata(&document, None, Some(&data));
    assert_eq!(meta.site_name.as_deref(), Some("My Blog"));
}
}
#[cfg(test)]
mod dc_tests {
    //! Dublin Core (and Parsely) meta-tag extraction.
    use super::*;
    use scraper::Html;

    /// Parse `html` and extract metadata with no URL and no JSON-LD schema.
    fn meta_of(html: &str) -> Metadata {
        extract_metadata(&Html::parse_document(html), None, None)
    }

    #[test]
    fn dc_title() {
        let m = meta_of(
            r#"<html><head><meta name="DC.title" content="Dublin Core Title"></head><body></body></html>"#,
        );
        assert_eq!(m.title.as_deref(), Some("Dublin Core Title"));
    }

    #[test]
    fn dcterm_title() {
        let m = meta_of(
            r#"<html><head><meta name="dcterm:title" content="DCTerm Title"></head><body></body></html>"#,
        );
        assert_eq!(m.title.as_deref(), Some("DCTerm Title"));
    }

    #[test]
    fn dc_creator_as_author() {
        let m = meta_of(
            r#"<html><head><meta name="DC.creator" content="Jane Doe"></head><body></body></html>"#,
        );
        assert_eq!(m.author.as_deref(), Some("Jane Doe"));
    }

    #[test]
    fn dc_date_as_published() {
        let m = meta_of(
            r#"<html><head><meta name="DC.date" content="2025-06-15"></head><body></body></html>"#,
        );
        assert_eq!(m.published.as_deref(), Some("2025-06-15"));
    }

    #[test]
    fn dcterms_created_as_published() {
        let m = meta_of(
            r#"<html><head><meta name="DCTERMS.created" content="2025-01-01T00:00:00Z"></head><body></body></html>"#,
        );
        assert_eq!(m.published.as_deref(), Some("2025-01-01T00:00:00Z"));
    }

    #[test]
    fn dc_description() {
        let m = meta_of(
            r#"<html><head><meta name="DC.description" content="A Dublin Core description"></head><body></body></html>"#,
        );
        assert_eq!(m.description.as_deref(), Some("A Dublin Core description"));
    }

    #[test]
    fn dc_language() {
        let m = meta_of(
            r#"<html><head><meta name="DC.language" content="en-US"></head><body></body></html>"#,
        );
        assert_eq!(m.language.as_deref(), Some("en-US"));
    }

    #[test]
    fn og_takes_precedence_over_dc() {
        // Open Graph outranks Dublin Core when both are present.
        let m = meta_of(
            r#"<html><head>
            <meta property="og:title" content="OG Title">
            <meta name="DC.title" content="DC Title">
            </head><body></body></html>"#,
        );
        assert_eq!(m.title.as_deref(), Some("OG Title"));
    }

    #[test]
    fn parsely_author() {
        let m = meta_of(
            r#"<html><head><meta name="parsely-author" content="Parsely Author"></head><body></body></html>"#,
        );
        assert_eq!(m.author.as_deref(), Some("Parsely Author"));
    }

    #[test]
    fn parsely_pub_date() {
        let m = meta_of(
            r#"<html><head><meta name="parsely-pub-date" content="2025-03-20"></head><body></body></html>"#,
        );
        assert_eq!(m.published.as_deref(), Some("2025-03-20"));
    }
}
#[cfg(test)]
mod modified_tests {
    //! Extraction of the "modified" timestamp from various sources.
    use super::*;
    use scraper::Html;

    /// Parse `html` and extract metadata with no URL and no JSON-LD schema.
    fn meta_of(html: &str) -> Metadata {
        extract_metadata(&Html::parse_document(html), None, None)
    }

    #[test]
    fn modified_from_article_modified_time() {
        let m = meta_of(
            r#"<html><head><meta property="article:modified_time" content="2025-09-01T12:00:00Z"></head><body></body></html>"#,
        );
        assert_eq!(m.modified.as_deref(), Some("2025-09-01T12:00:00Z"));
    }

    #[test]
    fn modified_from_og_updated_time() {
        let m = meta_of(
            r#"<html><head><meta property="og:updated_time" content="2025-08-15"></head><body></body></html>"#,
        );
        assert_eq!(m.modified.as_deref(), Some("2025-08-15"));
    }

    #[test]
    fn modified_from_dcterms() {
        let m = meta_of(
            r#"<html><head><meta name="DCTERMS.modified" content="2025-07-20"></head><body></body></html>"#,
        );
        assert_eq!(m.modified.as_deref(), Some("2025-07-20"));
    }

    #[test]
    fn modified_from_schema_date_modified() {
        let data = serde_json::json!({
            "@type": "Article",
            "dateModified": "2025-06-10T08:00:00Z"
        });
        let document = Html::parse_document("<html><body></body></html>");
        let m = extract_metadata(&document, None, Some(&data));
        assert_eq!(m.modified.as_deref(), Some("2025-06-10T08:00:00Z"));
    }

    #[test]
    fn modified_empty_when_absent() {
        let m = meta_of("<html><body></body></html>");
        assert!(m.modified.is_none());
    }

    // NOTE(review): this test covers DC.publisher → site_name, not a
    // "modified" field; it likely belongs in `dc_tests`. Kept here so this
    // change does not reshuffle test modules.
    #[test]
    fn dc_publisher_as_site_name() {
        let m = meta_of(
            r#"<html><head><meta name="DC.publisher" content="Example Press"></head><body></body></html>"#,
        );
        assert_eq!(m.site_name.as_deref(), Some("Example Press"));
    }
}
#[cfg(test)]
mod new_fields_tests {
    //! Tests for canonical_url, keywords, content_type, plus extra
    //! image/author fallbacks.
    use super::*;
    use scraper::Html;

    /// Parse `html` and extract metadata with an optional base URL and no schema.
    fn meta_with_url(html: &str, url: Option<&str>) -> Metadata {
        extract_metadata(&Html::parse_document(html), url, None)
    }

    /// Parse `html` and extract metadata with no URL and no schema.
    fn meta_of(html: &str) -> Metadata {
        meta_with_url(html, None)
    }

    #[test]
    fn canonical_url_from_link_rel() {
        let m = meta_of(
            r#"<html><head><link rel="canonical" href="https://example.com/article"></head><body></body></html>"#,
        );
        assert_eq!(
            m.canonical_url.as_deref(),
            Some("https://example.com/article")
        );
    }

    #[test]
    fn canonical_url_relative_resolved() {
        // A relative canonical href is resolved against the page URL.
        let m = meta_with_url(
            r#"<html><head><link rel="canonical" href="/article"></head><body></body></html>"#,
            Some("https://example.com/page"),
        );
        assert_eq!(
            m.canonical_url.as_deref(),
            Some("https://example.com/article")
        );
    }

    #[test]
    fn canonical_url_relative_no_base() {
        // Without a base URL the relative href is kept verbatim.
        let m = meta_of(
            r#"<html><head><link rel="canonical" href="/article"></head><body></body></html>"#,
        );
        assert_eq!(m.canonical_url.as_deref(), Some("/article"));
    }

    #[test]
    fn canonical_url_from_og_url() {
        // og:url serves as a fallback when no <link rel="canonical"> exists.
        let m = meta_of(
            r#"<html><head><meta property="og:url" content="https://example.com/page"></head><body></body></html>"#,
        );
        assert_eq!(m.canonical_url.as_deref(), Some("https://example.com/page"));
    }

    #[test]
    fn keywords_from_meta() {
        let m = meta_of(
            r#"<html><head><meta name="keywords" content="rust, programming, web"></head><body></body></html>"#,
        );
        assert_eq!(m.keywords, vec!["rust", "programming", "web"]);
    }

    #[test]
    fn keywords_from_dc_subject() {
        let m = meta_of(
            r#"<html><head><meta name="DC.subject" content="science, biology"></head><body></body></html>"#,
        );
        assert_eq!(m.keywords, vec!["science", "biology"]);
    }

    #[test]
    fn keywords_from_article_tag() {
        // Repeated article:tag tags each contribute one keyword, in order.
        let m = meta_of(
            r#"<html><head>
            <meta property="article:tag" content="rust">
            <meta property="article:tag" content="web">
            </head><body></body></html>"#,
        );
        assert_eq!(m.keywords, vec!["rust", "web"]);
    }

    #[test]
    fn keywords_empty_when_absent() {
        let m = meta_of("<html><body></body></html>");
        assert!(m.keywords.is_empty());
    }

    #[test]
    fn content_type_from_og_type() {
        let m = meta_of(
            r#"<html><head><meta property="og:type" content="article"></head><body></body></html>"#,
        );
        assert_eq!(m.content_type.as_deref(), Some("article"));
    }

    #[test]
    fn content_type_from_schema() {
        let data = serde_json::json!({"@type": "NewsArticle"});
        let document = Html::parse_document("<html><body></body></html>");
        let m = extract_metadata(&document, None, Some(&data));
        assert_eq!(m.content_type.as_deref(), Some("NewsArticle"));
    }

    #[test]
    fn content_type_skips_generic_schema_types() {
        // Generic wrapper types (WebSite) are skipped in favor of a
        // content-bearing @type.
        let data = serde_json::json!([
            {"@type": "WebSite", "name": "Example"},
            {"@type": "Article", "headline": "Test"}
        ]);
        let document = Html::parse_document("<html><body></body></html>");
        let m = extract_metadata(&document, None, Some(&data));
        assert_eq!(m.content_type.as_deref(), Some("Article"));
    }

    #[test]
    fn image_from_twitter_image_src() {
        let m = meta_of(
            r#"<html><head><meta name="twitter:image:src" content="https://img.example.com/photo.jpg"></head><body></body></html>"#,
        );
        assert_eq!(
            m.image.as_deref(),
            Some("https://img.example.com/photo.jpg")
        );
    }

    #[test]
    fn author_from_article_author() {
        let m = meta_of(
            r#"<html><head><meta property="article:author" content="Jane Smith"></head><body></body></html>"#,
        );
        assert_eq!(m.author.as_deref(), Some("Jane Smith"));
    }
}