use crate::constants::REGEXPS;
use crate::utils;
use once_cell::sync::Lazy;
use scraper::node::Node;
use scraper::{ElementRef, Html, Selector};
use serde_json::Value;
use std::borrow::Cow;
use std::collections::HashMap;
/// Article metadata collected from JSON-LD, `<meta>` tags, and the document itself.
///
/// Every field is optional; the derived `Default` leaves all of them as `None`.
#[derive(Debug, Clone, Default)]
pub struct Metadata {
/// Article title.
pub title: Option<String>,
/// Author credit (one name, or several joined with ", ").
pub byline: Option<String>,
/// Short description / summary of the article.
pub excerpt: Option<String>,
/// Name of the publishing site.
pub site_name: Option<String>,
/// Publication timestamp, kept as the raw string found in the page.
pub published_time: Option<String>,
/// Document language as declared by the page.
pub lang: Option<String>,
/// URL of the lead image.
pub image: Option<String>,
}
pub fn get_json_ld(document: &Html) -> Metadata {
let mut metadata = Metadata::default();
let script_selector = Selector::parse("script[type='application/ld+json']").unwrap();
let schema_regex = regex::Regex::new(r"^https?://schema\.org/?$").unwrap();
for script in document.select(&script_selector) {
let content = script.text().collect::<String>();
let content = content
.trim()
.trim_start_matches("<![CDATA[")
.trim_end_matches("]]>")
.trim();
if let Ok(mut parsed) = serde_json::from_str::<Value>(content) {
if let Some(arr) = parsed.as_array() {
if let Some(article) = arr.iter().find(|item| {
if let Some(type_val) = item.get("@type") {
if let Some(type_str) = type_val.as_str() {
return REGEXPS.json_ld_article_types.is_match(type_str);
}
}
false
}) {
parsed = article.clone();
} else {
continue;
}
}
let has_schema_context = if let Some(context) = parsed.get("@context") {
if let Some(ctx_str) = context.as_str() {
schema_regex.is_match(ctx_str)
} else if let Some(ctx_obj) = context.as_object() {
if let Some(vocab) = ctx_obj.get("@vocab").and_then(|v| v.as_str()) {
schema_regex.is_match(vocab)
} else {
false
}
} else {
false
}
} else {
false
};
if !has_schema_context {
continue;
}
if parsed.get("@type").is_none() {
if let Some(graph) = parsed.get("@graph").and_then(|g| g.as_array()) {
if let Some(article) = graph.iter().find(|item| {
if let Some(type_val) = item.get("@type") {
if let Some(type_str) = type_val.as_str() {
return REGEXPS.json_ld_article_types.is_match(type_str);
}
}
false
}) {
parsed = article.clone();
}
}
}
if let Some(type_val) = parsed.get("@type") {
if let Some(type_str) = type_val.as_str() {
if !REGEXPS.json_ld_article_types.is_match(type_str) {
continue;
}
} else {
continue;
}
} else {
continue;
}
let name = parsed.get("name").and_then(|v| v.as_str());
let headline = parsed.get("headline").and_then(|v| v.as_str());
let publisher_name = parsed
.get("publisher")
.and_then(|p| p.get("name"))
.and_then(|n| n.as_str());
if metadata.title.is_none() {
if let (Some(name_str), Some(pub_name)) = (name, publisher_name) {
if name_str.trim() == pub_name.trim() {
if let Some(headline_str) = headline {
metadata.title = Some(headline_str.trim().to_string());
}
} else {
metadata.title = Some(name_str.trim().to_string());
}
} else if let Some(name_str) = name {
metadata.title = Some(name_str.trim().to_string());
} else if let Some(headline_str) = headline {
metadata.title = Some(headline_str.trim().to_string());
}
}
if metadata.byline.is_none() {
if let Some(author) = parsed.get("author") {
if let Some(author_name) = author.get("name").and_then(|v| v.as_str()) {
metadata.byline = Some(author_name.trim().to_string());
} else if let Some(authors) = author.as_array() {
let names: Vec<String> = authors
.iter()
.filter_map(|a| a.get("name").and_then(|n| n.as_str()))
.map(|n| n.trim().to_string())
.collect();
if !names.is_empty() {
metadata.byline = Some(names.join(", "));
}
}
}
}
if metadata.excerpt.is_none() {
if let Some(description) = parsed.get("description").and_then(|v| v.as_str()) {
metadata.excerpt = Some(description.trim().to_string());
}
}
if metadata.site_name.is_none() {
if let Some(publisher) = parsed.get("publisher") {
if let Some(pub_name) = publisher.get("name").and_then(|v| v.as_str()) {
metadata.site_name = Some(pub_name.trim().to_string());
}
}
}
if metadata.published_time.is_none() {
if let Some(date_published) = parsed.get("datePublished").and_then(|v| v.as_str()) {
metadata.published_time = Some(date_published.trim().to_string());
}
}
if metadata.image.is_none() {
metadata.image = extract_json_ld_image(&parsed);
}
}
}
metadata
}
fn extract_json_ld_image(parsed: &Value) -> Option<String> {
if let Some(image) = parsed.get("image") {
if let Some(url) = image.as_str() {
let trimmed = url.trim();
if !trimmed.is_empty() {
return Some(trimmed.to_string());
}
}
if let Some(url) = image.get("url").and_then(|v| v.as_str()) {
let trimmed = url.trim();
if !trimmed.is_empty() {
return Some(trimmed.to_string());
}
}
if let Some(id) = image.get("@id").and_then(|v| v.as_str()) {
let trimmed = id.trim();
if !trimmed.is_empty()
&& (trimmed.starts_with("http://") || trimmed.starts_with("https://"))
{
return Some(trimmed.to_string());
}
}
if let Some(arr) = image.as_array() {
for img in arr {
if let Some(url) = img.as_str() {
let trimmed = url.trim();
if !trimmed.is_empty() {
return Some(trimmed.to_string());
}
}
if let Some(url) = img.get("url").and_then(|v| v.as_str()) {
let trimmed = url.trim();
if !trimmed.is_empty() {
return Some(trimmed.to_string());
}
}
}
}
}
if let Some(thumbnail) = parsed.get("thumbnailUrl").and_then(|v| v.as_str()) {
let trimmed = thumbnail.trim();
if !trimmed.is_empty() {
return Some(trimmed.to_string());
}
}
None
}
pub fn get_article_metadata(document: &Html, json_ld: Metadata) -> Metadata {
let mut values: HashMap<String, String> = HashMap::new();
let property_pattern = regex::Regex::new(
r"(?i)\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name|image:url|image:secure_url|image$)\s*"
).unwrap();
let name_pattern = regex::Regex::new(
r"(?i)^\s*(?:(?:article|dc|dcterm|og|twitter|parsely|weibo:(?:article|webpage))\s*[-\.:]\s*)?(author|author_name|creator|pub-date|description|title|site_name|image|thumbnail)\s*$"
).unwrap();
let meta_selector = Selector::parse("meta").unwrap();
for meta in document.select(&meta_selector) {
let element_name = meta.value().attr("name");
let element_property = meta.value().attr("property");
let content = meta.value().attr("content");
if content.is_none() || content.unwrap().is_empty() {
continue;
}
let content = content.unwrap();
let mut matched_name: Option<String> = None;
if let Some(property) = element_property {
for prop in property.split_whitespace() {
if let Some(mat) = property_pattern.find(prop) {
let key = prop[mat.start()..mat.end()]
.to_lowercase()
.replace(char::is_whitespace, "");
values.insert(key, content.trim().to_string());
matched_name = Some(property.to_string());
}
}
}
if matched_name.is_none() {
if let Some(name) = element_name {
if name_pattern.is_match(name) {
let normalized = name
.to_lowercase()
.replace(char::is_whitespace, "")
.replace('.', ":");
values.insert(normalized, content.trim().to_string());
}
}
}
}
let mut metadata = Metadata {
title: json_ld.title.or_else(|| {
values
.get("dc:title")
.or_else(|| values.get("dcterm:title"))
.or_else(|| values.get("og:title"))
.or_else(|| values.get("weibo:article:title"))
.or_else(|| values.get("weibo:webpage:title"))
.or_else(|| values.get("title"))
.or_else(|| values.get("twitter:title"))
.or_else(|| values.get("parsely-title"))
.cloned()
}),
..Default::default()
};
if metadata.title.is_none() {
metadata.title = extract_title_from_document(document);
}
if metadata.title.is_none() {
metadata.title = Some(String::new());
}
let article_author = values
.get("article:author")
.or_else(|| values.get("article:author_name"))
.filter(|v| !utils::is_url(v))
.cloned();
let dom_byline = extract_byline_from_document(document);
let mut meta_byline = json_ld.byline.or_else(|| {
values
.get("dc:creator")
.or_else(|| values.get("dcterm:creator"))
.or_else(|| values.get("author"))
.or_else(|| values.get("parsely-author"))
.or(article_author.as_ref())
.cloned()
});
if let Some(dom_value) = dom_byline.clone() {
let dom_text = dom_value.text.clone();
match &meta_byline {
Some(existing) => {
if should_prefer_dom_byline(existing, &dom_text, dom_value.confidence) {
meta_byline = Some(dom_text);
}
}
None => meta_byline = Some(dom_text),
}
}
metadata.byline = meta_byline;
metadata.excerpt = json_ld.excerpt.or_else(|| {
values
.get("dc:description")
.or_else(|| values.get("dcterm:description"))
.or_else(|| values.get("og:description"))
.or_else(|| values.get("weibo:article:description"))
.or_else(|| values.get("weibo:webpage:description"))
.or_else(|| values.get("description"))
.or_else(|| values.get("twitter:description"))
.cloned()
});
metadata.site_name = json_ld
.site_name
.or_else(|| values.get("og:site_name").cloned());
metadata.published_time = json_ld.published_time.or_else(|| {
values
.get("article:published_time")
.or_else(|| values.get("parsely-pub-date"))
.cloned()
});
metadata.image = json_ld.image.or_else(|| {
values
.get("og:image:secure_url")
.or_else(|| values.get("og:image:url"))
.or_else(|| values.get("og:image"))
.or_else(|| values.get("twitter:image"))
.or_else(|| values.get("thumbnail"))
.or_else(|| values.get("image"))
.cloned()
});
if metadata.image.is_none() {
metadata.image = extract_image_from_document(document);
}
metadata.lang = extract_language_from_document(document);
metadata.title = metadata.title.map(|t| utils::unescape_html_entities(&t));
metadata.byline = metadata
.byline
.map(|b| utils::unescape_html_entities(&b))
.and_then(|b| utils::clean_byline_text(&b));
metadata.excerpt = metadata
.excerpt
.map(|e| utils::unescape_html_entities(&e))
.and_then(|e| {
let trimmed = e.trim();
if trimmed.is_empty() {
return None;
}
if utils::looks_like_bracket_menu(trimmed) {
return None;
}
Some(e)
});
metadata.site_name = metadata
.site_name
.map(|s| utils::unescape_html_entities(&s));
if let (Some(existing), Some(dom_value)) = (metadata.byline.clone(), dom_byline.clone()) {
if should_prefer_dom_byline(&existing, &dom_value.text, dom_value.confidence) {
metadata.byline =
utils::clean_byline_text(&dom_value.text).or_else(|| Some(dom_value.text.clone()));
}
}
#[cfg(test)]
{
if metadata.title.as_deref() == Some("Un troisième Français mort dans le séisme au Népal")
{
eprintln!("herald dom_byline inside metadata: {:?}", dom_byline);
eprintln!("herald existing after clean: {:?}", metadata.byline);
}
}
if let Some(caps_candidate) = extract_standfirst_caps_byline(document) {
match &metadata.byline {
Some(existing) => {
if should_prefer_caps_standfirst(existing, &caps_candidate) {
metadata.byline = Some(caps_candidate);
}
}
None => metadata.byline = Some(caps_candidate),
}
}
if let (Some(byline), Some(site_name)) = (metadata.byline.clone(), metadata.site_name.clone()) {
if utils::is_byline_redundant_with_site_name(&byline, &site_name) {
metadata.byline = None;
}
}
metadata.published_time = metadata
.published_time
.map(|p| utils::unescape_html_entities(&p));
metadata.image = metadata.image.and_then(|img| {
let trimmed = img.trim();
if trimmed.is_empty() {
return None;
}
Some(utils::unescape_html_entities(trimmed))
});
metadata
}
fn extract_image_from_document(document: &Html) -> Option<String> {
if let Ok(selector) = Selector::parse("link[rel='image_src']") {
if let Some(link) = document.select(&selector).next() {
if let Some(href) = link.value().attr("href") {
let trimmed = href.trim();
if !trimmed.is_empty() {
return Some(trimmed.to_string());
}
}
}
}
if let Ok(selector) = Selector::parse("[itemprop='image']") {
for elem in document.select(&selector) {
if let Some(content) = elem.value().attr("content") {
let trimmed = content.trim();
if !trimmed.is_empty() {
return Some(trimmed.to_string());
}
}
if let Some(src) = elem.value().attr("src") {
let trimmed = src.trim();
if !trimmed.is_empty() {
return Some(trimmed.to_string());
}
}
if let Some(href) = elem.value().attr("href") {
let trimmed = href.trim();
if !trimmed.is_empty() {
return Some(trimmed.to_string());
}
}
}
}
None
}
/// A byline found in the document markup, together with how much it should be trusted.
#[derive(Clone, Debug, PartialEq, Eq)]
struct DomBylineCandidate {
/// Byline text as produced by the extraction step that found it.
text: String,
/// Trust level assigned by the extraction step that found it.
confidence: DomBylineConfidence,
}
impl DomBylineCandidate {
/// Bundles `text` and `confidence` into a candidate.
fn new(text: String, confidence: DomBylineConfidence) -> Self {
Self { text, confidence }
}
}
/// Trust level attached to a DOM-derived byline candidate.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum DomBylineConfidence {
/// Found via a strong signal (standfirst byline, `rel=author`, `itemprop=author`, or an
/// element whose class/id mentions "byline").
High,
/// Found via a byline/author-like class or id without an explicit "byline" marker.
Medium,
/// Found via weak signals (`<address>` or generic text elements).
Low,
}
/// Searches the document markup for an author byline.
///
/// Stages are tried in order, from strongest to weakest signal:
/// 1. an all-caps byline inside a "standfirst" container,
/// 2. `a[rel~='author']` links,
/// 3. `[itemprop~='author']` elements,
/// 4. a fixed list of byline/author class and id selectors,
/// 5. any element whose class or id contains "byline", "author" or "credit",
/// 6. `<address>` elements,
/// 7. generic `p`, `div` and `span` elements whose text looks like a byline.
///
/// Stages 1-3 return on the first accepted text. In stages 4-7 an accepted candidate is
/// returned immediately only if `is_priority_dom_candidate` approves it; otherwise the
/// first such candidate is remembered and returned at the very end as a fallback.
///
/// Returns `None` when nothing suitable is found.
///
/// NOTE(review): handling of `CleanBylineOutcome::DroppedOrgCredit` differs between stages.
/// Stages 2, 3, 4 and 7 abort the whole search with `return None` (discarding any fallback
/// already recorded), while stages 5 and 6 merely `continue`. Confirm whether this
/// asymmetry is intentional.
fn extract_byline_from_document(document: &Html) -> Option<DomBylineCandidate> {
use crate::scoring;
// First non-priority candidate seen in stages 4-7; used only if nothing better turns up.
let mut fallback_candidate: Option<DomBylineCandidate> = None;
// Stage 1: caps byline inside a standfirst block.
if let Some(candidate) = extract_standfirst_caps_byline(document) {
return Some(DomBylineCandidate::new(
candidate,
DomBylineConfidence::High,
));
}
// Stage 2: links explicitly marked rel="author".
if let Ok(author_link_selector) = Selector::parse("a[rel~='author']") {
for link in document.select(&author_link_selector) {
if is_ignorable_byline_context(&link) {
continue;
}
if is_noise_byline_context(&link) {
continue;
}
// Prefer the enclosing byline element's text when the parent qualifies.
if let Some(parent_text) = parent_byline_text(&link) {
return Some(DomBylineCandidate::new(
parent_text,
DomBylineConfidence::High,
));
}
let text = collect_byline_candidate_text(link).trim().to_string();
if !text.is_empty() {
let class = link.value().attr("class").unwrap_or("");
let id = link.value().attr("id").unwrap_or("");
let rel_attr = link.value().attr("rel").unwrap_or("");
let match_string = format!("{class} {id}");
let has_author_rel = rel_attr
.split_whitespace()
.any(|rel| rel.eq_ignore_ascii_case("author"));
if has_author_rel || scoring::is_valid_byline(link, &match_string) {
match utils::clean_byline_text_with_reason(&text) {
utils::CleanBylineOutcome::Accepted(cleaned) => {
return Some(DomBylineCandidate::new(
cleaned,
DomBylineConfidence::High,
))
}
utils::CleanBylineOutcome::DroppedOrgCredit => return None,
utils::CleanBylineOutcome::Dropped => {}
}
}
}
}
}
// Stage 3: microdata author markup.
if let Ok(itemprop_selector) = Selector::parse("[itemprop~='author']") {
for elem in document.select(&itemprop_selector) {
if is_ignorable_byline_context(&elem) {
continue;
}
if is_noise_byline_context(&elem) {
continue;
}
if let Some(parent_text) = parent_byline_text(&elem) {
return Some(DomBylineCandidate::new(
parent_text,
DomBylineConfidence::High,
));
}
let text = collect_byline_candidate_text(elem).trim().to_string();
if !text.is_empty() {
let class = elem.value().attr("class").unwrap_or("");
let id = elem.value().attr("id").unwrap_or("");
let itemprop = elem.value().attr("itemprop").unwrap_or("");
let match_string = format!("{class} {id}");
let has_author_itemprop = itemprop
.split_whitespace()
.any(|prop| prop.eq_ignore_ascii_case("author"));
if has_author_itemprop || scoring::is_valid_byline(elem, &match_string) {
match utils::clean_byline_text_with_reason(&text) {
utils::CleanBylineOutcome::Accepted(cleaned) => {
return Some(DomBylineCandidate::new(
cleaned,
DomBylineConfidence::High,
))
}
utils::CleanBylineOutcome::DroppedOrgCredit => return None,
utils::CleanBylineOutcome::Dropped => {}
}
}
}
}
}
// Stage 4: well-known byline/author selectors, tried in list order.
let byline_patterns = [
".byline",
".pb-byline",
".author",
".by",
".writer",
".article-author",
".post-author",
".entry-author",
"#byline",
"#author",
"[class*='author']",
"[class*='byline']",
];
for pattern in &byline_patterns {
if let Ok(selector) = Selector::parse(pattern) {
for elem in document.select(&selector) {
// Context filters apply only when the element itself carries no byline keyword.
if !element_has_byline_keyword(&elem) && is_ignorable_byline_context(&elem) {
continue;
}
if !element_has_byline_keyword(&elem) && is_noise_byline_context(&elem) {
continue;
}
let text = collect_byline_candidate_text(elem).trim().to_string();
let text_is_caps = looks_like_caps_author(&text);
// Length is measured in bytes, not characters.
if text.is_empty() || text.len() > 100 {
continue;
}
let class = elem.value().attr("class").unwrap_or("");
let id = elem.value().attr("id").unwrap_or("");
let match_string = format!("{class} {id}");
if scoring::is_valid_byline(elem, &match_string)
|| utils::looks_like_byline(&text)
|| text_is_caps
{
let confidence = if element_has_explicit_byline_marker(&elem) {
DomBylineConfidence::High
} else {
DomBylineConfidence::Medium
};
match utils::clean_byline_text_with_reason(&text) {
utils::CleanBylineOutcome::Accepted(cleaned) => {
let candidate = DomBylineCandidate::new(cleaned, confidence);
if is_priority_dom_candidate(&candidate, text_is_caps) {
return Some(candidate);
} else if fallback_candidate.is_none() {
fallback_candidate = Some(candidate);
}
}
utils::CleanBylineOutcome::DroppedOrgCredit => return None,
utils::CleanBylineOutcome::Dropped => {}
}
}
}
}
}
// Stage 5: any element whose class/id mentions byline, author or credit.
if let Ok(selector) = Selector::parse("[class], [id]") {
for elem in document.select(&selector) {
if is_ignorable_byline_context(&elem) {
continue;
}
if is_noise_byline_context(&elem) {
continue;
}
let class = elem.value().attr("class").unwrap_or("");
let id = elem.value().attr("id").unwrap_or("");
let class_lower = class.to_lowercase();
let id_lower = id.to_lowercase();
// "credit" is accepted in the class only, not in the id.
if !(class_lower.contains("byline")
|| class_lower.contains("author")
|| class_lower.contains("credit")
|| id_lower.contains("byline")
|| id_lower.contains("author"))
{
continue;
}
let text = collect_byline_candidate_text(elem).trim().to_string();
if text.is_empty() || text.len() > 120 {
continue;
}
let text_is_caps = looks_like_caps_author(&text);
let match_string = format!("{class} {id}");
if scoring::is_valid_byline(elem, &match_string)
|| utils::looks_like_byline(&text)
|| text_is_caps
{
match utils::clean_byline_text_with_reason(&text) {
utils::CleanBylineOutcome::Accepted(cleaned) => {
let candidate =
DomBylineCandidate::new(cleaned, DomBylineConfidence::Medium);
if is_priority_dom_candidate(&candidate, text_is_caps) {
return Some(candidate);
} else if fallback_candidate.is_none() {
fallback_candidate = Some(candidate);
}
}
utils::CleanBylineOutcome::DroppedOrgCredit => continue,
utils::CleanBylineOutcome::Dropped => {}
}
}
}
}
// Stage 6: <address> elements.
if let Ok(address_selector) = Selector::parse("address") {
for elem in document.select(&address_selector) {
if is_ignorable_byline_context(&elem) {
continue;
}
if is_noise_byline_context(&elem) {
continue;
}
let text = collect_byline_candidate_text(elem).trim().to_string();
if text.is_empty() || text.len() > 100 {
continue;
}
let text_is_caps = looks_like_caps_author(&text);
// Here the element text itself (not class/id) is passed as the match string.
if utils::looks_like_byline(&text)
|| scoring::is_valid_byline(elem, &text)
|| text_is_caps
{
match utils::clean_byline_text_with_reason(&text) {
utils::CleanBylineOutcome::Accepted(cleaned) => {
let candidate = DomBylineCandidate::new(cleaned, DomBylineConfidence::Low);
if is_priority_dom_candidate(&candidate, text_is_caps) {
return Some(candidate);
} else if fallback_candidate.is_none() {
fallback_candidate = Some(candidate);
}
}
utils::CleanBylineOutcome::DroppedOrgCredit => continue,
utils::CleanBylineOutcome::Dropped => {}
}
}
}
}
// Stage 7: last resort, generic text containers judged by their text alone.
if let Ok(selector) = Selector::parse("p, div, span") {
for elem in document.select(&selector) {
if is_ignorable_byline_context(&elem) {
continue;
}
if is_noise_byline_context(&elem) {
continue;
}
let text = collect_byline_candidate_text(elem).trim().to_string();
if text.is_empty() || text.len() > 120 {
continue;
}
if utils::looks_like_dateline(&text) {
continue;
}
let text_is_caps = looks_like_caps_author(&text);
if utils::looks_like_byline(&text) || text_is_caps {
match utils::clean_byline_text_with_reason(&text) {
utils::CleanBylineOutcome::Accepted(cleaned) => {
let candidate = DomBylineCandidate::new(cleaned, DomBylineConfidence::Low);
if is_priority_dom_candidate(&candidate, text_is_caps) {
return Some(candidate);
} else if fallback_candidate.is_none() {
fallback_candidate = Some(candidate);
}
}
utils::CleanBylineOutcome::DroppedOrgCredit => return None,
utils::CleanBylineOutcome::Dropped => {}
}
}
}
}
// Nothing qualified as a priority candidate: fall back to the first weaker one, if any.
if let Some(candidate) = fallback_candidate {
return Some(candidate);
}
None
}
/// Finds an all-caps author credit placed inside a "standfirst" container.
///
/// Scans `em.byline` and then `[class*='byline']` elements. An element qualifies when it
/// (or one of its nearest five ancestors) has "standfirst" in its class or id, it is not in
/// an ignorable or noisy context, its text is non-empty and at most 80 bytes, and the text
/// passes `looks_like_caps_author`. Returns the cleaned text of the first element whose
/// text survives byline cleaning.
fn extract_standfirst_caps_byline(document: &Html) -> Option<String> {
    const SELECTORS: [&str; 2] = ["em.byline", "[class*='byline']"];
    const STANDFIRST_KEYWORDS: [&str; 1] = ["standfirst"];

    for pattern in SELECTORS.iter() {
        let selector = match Selector::parse(pattern) {
            Ok(selector) => selector,
            Err(_) => continue,
        };
        for elem in document.select(&selector) {
            let in_standfirst = ancestor_has_keyword(&elem, &STANDFIRST_KEYWORDS, 5);
            if !in_standfirst
                || is_ignorable_byline_context(&elem)
                || is_noise_byline_context(&elem)
            {
                continue;
            }

            let text = collect_byline_candidate_text(elem).trim().to_string();
            let plausible_length = !text.is_empty() && text.len() <= 80;
            if !plausible_length || !looks_like_caps_author(&text) {
                continue;
            }

            // Rejected text (for any reason) just moves the scan on to the next element.
            match utils::clean_byline_text_with_reason(&text) {
                utils::CleanBylineOutcome::Accepted(cleaned) => return Some(cleaned),
                _ => continue,
            }
        }
    }
    None
}
/// Concatenates the text beneath `element`, turning every `<br>` into a newline.
///
/// Text that directly follows a newline already in the buffer has one leading `'\n'`
/// removed, and is then passed through `strip_intermediate_newline`, so a `<br>` followed
/// by source-level line breaks does not pile up newlines.
fn build_byline_text(element: &ElementRef) -> String {
    /// Depth-first walk appending text nodes and `<br>` newlines to `out`.
    fn walk(node: &ElementRef, out: &mut String) {
        for child in node.children() {
            match child.value() {
                Node::Text(text) => {
                    let raw: &str = text.as_ref();
                    if out.ends_with('\n') {
                        let rest = raw.strip_prefix('\n').unwrap_or(raw);
                        out.push_str(&strip_intermediate_newline(rest));
                    } else {
                        out.push_str(raw);
                    }
                }
                Node::Element(data) => {
                    if data.name().eq_ignore_ascii_case("br") {
                        out.push('\n');
                    }
                    if let Some(child_el) = ElementRef::wrap(child) {
                        walk(&child_el, out);
                    }
                }
                _ => {}
            }
        }
    }

    let mut collected = String::new();
    walk(element, &mut collected);
    collected
}
/// Removes a single `'\n'` that follows only leading ASCII whitespace.
///
/// Leading ASCII whitespace other than `'\n'` is skipped; if the next byte is `'\n'` that
/// one byte is dropped and an owned string is returned. Otherwise `text` is returned
/// borrowed and unchanged.
fn strip_intermediate_newline(text: &str) -> Cow<'_, str> {
    let lead = text
        .bytes()
        .take_while(|byte| byte.is_ascii_whitespace() && *byte != b'\n')
        .count();

    match text.as_bytes().get(lead).copied() {
        Some(b'\n') => {
            // Everything before `lead` is ASCII, so both cuts fall on char boundaries.
            let mut stripped = String::with_capacity(text.len() - 1);
            stripped.push_str(&text[..lead]);
            stripped.push_str(&text[lead + 1..]);
            Cow::Owned(stripped)
        }
        _ => Cow::Borrowed(text),
    }
}
/// Produces the text to evaluate as a byline for `element`.
///
/// Normally this is the element's raw text (with `<br>` as newlines). When author names can
/// be collected from child elements and `should_prefer_child_names` approves, the names
/// joined with ", " are returned instead.
fn collect_byline_candidate_text(element: ElementRef) -> String {
    let raw_text = build_byline_text(&element);
    match collect_child_author_names(&element) {
        Some(names) if should_prefer_child_names(&element, &raw_text, &names) => names.join(", "),
        _ => raw_text,
    }
}
/// Lazily built selector for elements carrying `itemprop` "name", either as the whole
/// attribute value or as one of its whitespace-separated tokens.
/// The `unwrap` panics on first use if the selector string fails to parse.
static ITEMPROP_NAME_SELECTOR: Lazy<Selector> =
Lazy::new(|| Selector::parse("[itemprop='name'], [itemprop~='name']").unwrap());
/// Gathers distinct author names from descendants of `element`.
///
/// Names come first from `itemprop="name"` descendants, then from `<a>` descendants whose
/// text passes `utils::looks_like_author_name`, contains no `'@'`, and whose `href` (if
/// any) is neither a `mailto:` link nor a twitter/facebook/linkedin URL. Duplicates are
/// dropped ignoring ASCII case. Returns `None` when no name was found.
fn collect_child_author_names(element: &ElementRef) -> Option<Vec<String>> {
    static ANCHOR_SELECTOR: Lazy<Selector> =
        Lazy::new(|| Selector::parse("a").expect("valid anchor selector"));
    const EXCLUDED_HOSTS: [&str; 3] = ["twitter.com", "facebook.com", "linkedin.com"];

    /// Appends `name` unless an ASCII-case-insensitive equal entry is already present.
    fn add_distinct(collected: &mut Vec<String>, name: String) {
        let already_known = collected
            .iter()
            .any(|seen| seen.eq_ignore_ascii_case(&name));
        if !already_known {
            collected.push(name);
        }
    }

    /// Full text content of `node`, trimmed.
    fn trimmed_text(node: &ElementRef) -> String {
        node.text().collect::<String>().trim().to_string()
    }

    let mut collected = Vec::new();

    for node in element.select(&ITEMPROP_NAME_SELECTOR) {
        let label = trimmed_text(&node);
        if !label.is_empty() {
            add_distinct(&mut collected, label);
        }
    }

    for anchor in element.select(&ANCHOR_SELECTOR) {
        let label = trimmed_text(&anchor);
        let plausible_name =
            !label.is_empty() && !label.contains('@') && utils::looks_like_author_name(&label);
        if !plausible_name {
            continue;
        }

        // Contact and social links are not author names even if the text looks like one.
        let is_contact_link = anchor.value().attr("href").map_or(false, |href| {
            let href = href.to_lowercase();
            href.starts_with("mailto:") || EXCLUDED_HOSTS.iter().any(|host| href.contains(*host))
        });
        if is_contact_link {
            continue;
        }

        add_distinct(&mut collected, label);
    }

    if collected.is_empty() {
        None
    } else {
        Some(collected)
    }
}
/// True when `element` itself lists "name" among its `itemprop` tokens (ASCII
/// case-insensitive), or when any descendant matches `ITEMPROP_NAME_SELECTOR`.
fn element_has_semantic_name(element: &ElementRef) -> bool {
    let own_itemprop_is_name = element.value().attr("itemprop").map_or(false, |itemprop| {
        itemprop
            .split_whitespace()
            .any(|token| token.eq_ignore_ascii_case("name"))
    });

    own_itemprop_is_name || element.select(&ITEMPROP_NAME_SELECTOR).next().is_some()
}
/// Decides whether the names collected from child elements should replace the raw text.
///
/// Returns `false` for an empty `names`. Returns `true` when:
/// * the element or one of its nearest four ancestors has "authorinfo"/"author-info" in
///   its class or id, or the element's own class/id does;
/// * the element has a `section` attribute containing "author";
/// * after removing the names and common punctuation from the lowercased raw text, nothing
///   is left, or a leftover word is a job descriptor, or the element has semantic name
///   markup and every leftover word is "by".
fn should_prefer_child_names(element: &ElementRef, raw_text: &str, names: &[String]) -> bool {
    const AUTHOR_INFO_MARKERS: [&str; 2] = ["authorinfo", "author-info"];

    if names.is_empty() {
        return false;
    }
    if ancestor_has_keyword(element, &AUTHOR_INFO_MARKERS, 4) {
        return true;
    }

    let node = element.value();

    let mut class_and_id = node.attr("class").unwrap_or("").to_string();
    if let Some(id) = node.attr("id") {
        if !class_and_id.is_empty() {
            class_and_id.push(' ');
        }
        class_and_id.push_str(id);
    }
    let class_and_id = class_and_id.to_lowercase();
    if AUTHOR_INFO_MARKERS
        .iter()
        .any(|marker| class_and_id.contains(*marker))
    {
        return true;
    }

    let section_mentions_author = node
        .attr("section")
        .map_or(false, |section| section.to_lowercase().contains("author"));
    if section_mentions_author {
        return true;
    }

    // Blank out every known name, then treat separators and punctuation as whitespace so
    // only the surrounding words remain.
    let without_names = names
        .iter()
        .fold(raw_text.to_lowercase(), |text, name| {
            text.replace(&name.to_lowercase(), " ")
        });
    let without_breaks = without_names.replace(['\u{00a0}', '\u{200b}', '\r', '\n'], " ");
    let without_punctuation =
        without_breaks.replace(['.', ',', '–', '—', '-', '|', ':', ';', '/', '(', ')'], " ");
    let leftovers: Vec<&str> = without_punctuation.split_whitespace().collect();

    if leftovers.is_empty() {
        return true;
    }
    if leftovers.iter().any(|word| looks_like_job_descriptor(word)) {
        return true;
    }

    let only_by_remains = leftovers.iter().all(|word| *word == "by");
    only_by_remains && element_has_semantic_name(element)
}
/// True when `token` is exactly one of the known job/role words.
///
/// The comparison is case-sensitive and expects an already lowercased single word.
fn looks_like_job_descriptor(token: &str) -> bool {
    matches!(
        token,
        "reporter"
            | "editor"
            | "writer"
            | "staff"
            | "senior"
            | "technologist"
            | "correspondent"
            | "columnist"
            | "analyst"
            | "producer"
            | "anchor"
            | "bureau"
            | "desk"
            | "spokesman"
            | "spokeswoman"
            | "spokesperson"
            | "contributor"
            | "team"
            | "author"
    )
}
/// Lowercase English month names and their common abbreviations.
/// Used to discard date fragments when comparing byline texts.
const MONTH_KEYWORDS: [&str; 24] = [
"jan",
"january",
"feb",
"february",
"mar",
"march",
"apr",
"april",
"may",
"jun",
"june",
"jul",
"july",
"aug",
"august",
"sep",
"sept",
"september",
"oct",
"october",
"nov",
"november",
"dec",
"december",
];
/// Decides whether a byline found in the DOM should replace the existing byline.
///
/// * `existing` – the byline currently selected (from JSON-LD or `<meta>`).
/// * `dom` – the byline text found in the document markup.
/// * `confidence` – how strongly the DOM candidate was identified.
///
/// Returns `false` when both texts are equal ignoring ASCII case. Returns `true` when:
/// * `existing` looks like an organisation credit and `dom` does not;
/// * `existing` looks like a dateline and `dom` does not;
/// * `dom` is a high-confidence all-caps author and `existing` is not all-caps;
/// * `dom` contains `existing` (case- and whitespace-insensitively) and the surrounding
///   text holds at least one word that is not a number, a month name, or filler such as
///   "by"/"updated"/"at"/"am"/"pm".
///
/// In every other case the existing byline is kept.
fn should_prefer_dom_byline(existing: &str, dom: &str, confidence: DomBylineConfidence) -> bool {
    const FILLER_TOKENS: [&str; 5] = ["by", "updated", "at", "am", "pm"];

    /// Lowercases `text` and collapses every whitespace run into a single space.
    fn normalize(text: &str) -> String {
        let lowered = text.to_lowercase();
        let words: Vec<&str> = lowered.split_whitespace().collect();
        words.join(" ")
    }

    let existing_clean = existing.trim();
    let dom_clean = dom.trim();

    if dom_clean.eq_ignore_ascii_case(existing_clean) {
        return false;
    }
    if utils::looks_like_org_credit(existing_clean) && !utils::looks_like_org_credit(dom_clean) {
        return true;
    }
    if utils::looks_like_dateline(existing_clean) && !utils::looks_like_dateline(dom_clean) {
        return true;
    }
    if confidence == DomBylineConfidence::High
        && looks_like_caps_author(dom_clean)
        && !looks_like_caps_author(existing_clean)
    {
        return true;
    }

    let dom_normalized = normalize(dom_clean);
    let existing_normalized = normalize(existing_clean);

    // The DOM text is only interesting if it extends the existing byline.
    let start = match dom_normalized.find(&existing_normalized) {
        Some(start) => start,
        None => return false,
    };

    // Cut the existing byline out of the *normalized* DOM text, so that differences in
    // whitespace alone never leave the name itself behind as "extra" content. The space at
    // the cut keeps the two sides from fusing into a spurious word.
    let end = start + existing_normalized.len();
    let mut remainder = String::with_capacity(dom_normalized.len() + 1);
    remainder.push_str(&dom_normalized[..start]);
    remainder.push(' ');
    remainder.push_str(&dom_normalized[end..]);

    let remainder = remainder.replace(
        [
            '|', '-', '_', ',', '.', '–', '—', '(', ')', '[', ']', '{', '}', '"', '\'',
        ],
        " ",
    );

    // Prefer the DOM text only if it adds a word that is not a number, filler, or month.
    let adds_information = remainder.split_whitespace().any(|token| {
        !token.chars().all(|ch| ch.is_ascii_digit())
            && !FILLER_TOKENS.contains(&token)
            && !MONTH_KEYWORDS.contains(&token)
    });
    adds_information
}
/// True when the standfirst `candidate` should replace `existing`: the two differ (ignoring
/// ASCII case and surrounding whitespace), `existing` is not an all-caps author credit,
/// and `candidate` is one.
fn should_prefer_caps_standfirst(existing: &str, candidate: &str) -> bool {
    let current = existing.trim();
    let proposed = candidate.trim();

    !proposed.eq_ignore_ascii_case(current)
        && !looks_like_caps_author(current)
        && looks_like_caps_author(proposed)
}
/// Heuristic for an author name written in capitals (e.g. "JANE DOE").
///
/// Requires, after trimming: at least one whitespace character, at least three alphabetic
/// characters, no noise word (see `contains_caps_noise_token`), and at least 80% of the
/// alphabetic characters being uppercase.
fn looks_like_caps_author(text: &str) -> bool {
    let trimmed = text.trim();
    let has_inner_space = trimmed.chars().any(|c| c.is_whitespace());
    if trimmed.is_empty() || !has_inner_space {
        return false;
    }

    let (letter_count, upper_count) = trimmed
        .chars()
        .filter(|c| c.is_alphabetic())
        .fold((0usize, 0usize), |(letters, uppers), c| {
            (letters + 1, uppers + usize::from(c.is_uppercase()))
        });

    if letter_count < 3 || contains_caps_noise_token(trimmed) {
        return false;
    }

    // Integer form of `upper_count / letter_count >= 0.8`.
    upper_count * 10 >= letter_count * 8
}
/// True when any whitespace-separated word of `text` is a known UI noise word
/// (counters, share/vote labels and similar).
///
/// Each word is stripped of leading and trailing non-alphanumeric characters and lowercased
/// before comparison.
fn contains_caps_noise_token(text: &str) -> bool {
    for raw in text.split_whitespace() {
        let word = raw
            .trim_matches(|c: char| !c.is_alphanumeric())
            .to_lowercase();
        let is_noise = matches!(
            word.as_str(),
            "views"
                | "view"
                | "votes"
                | "vote"
                | "post"
                | "posts"
                | "yes"
                | "no"
                | "hot"
                | "stats"
                | "trending"
                | "share"
                | "sections"
        );
        if is_noise {
            return true;
        }
    }
    false
}
/// Returns the cleaned byline text of `element`'s parent, when the parent is itself a
/// byline container.
///
/// Yields `None` when there is no parent element, when the parent sits in an ignorable or
/// noisy context, when its class/id carries no byline keyword, or when its text does not
/// survive byline cleaning.
fn parent_byline_text(element: &ElementRef) -> Option<String> {
    let parent = element.parent().and_then(ElementRef::wrap)?;

    let is_byline_container = !is_ignorable_byline_context(&parent)
        && !is_noise_byline_context(&parent)
        && element_has_byline_keyword(&parent);
    if !is_byline_container {
        return None;
    }

    let text = collect_byline_candidate_text(parent).trim().to_string();
    match utils::clean_byline_text_with_reason(&text) {
        utils::CleanBylineOutcome::Accepted(cleaned) => Some(cleaned),
        _ => None,
    }
}
/// True when the element's `class` or `id` (lowercased) contains "byline", "author",
/// "writer" or "credit" as a substring.
fn element_has_byline_keyword(element: &ElementRef) -> bool {
    const MARKERS: [&str; 4] = ["byline", "author", "writer", "credit"];

    let node = element.value();
    ["class", "id"].iter().any(|attr| {
        let value = node.attr(*attr).unwrap_or("").to_lowercase();
        MARKERS.iter().any(|marker| value.contains(*marker))
    })
}
/// True when the element's `class` or `id` (lowercased) contains the substring "byline".
fn element_has_explicit_byline_marker(element: &ElementRef) -> bool {
    let node = element.value();
    let mentions_byline =
        |attr: &str| node.attr(attr).unwrap_or("").to_lowercase().contains("byline");

    mentions_byline("class") || mentions_byline("id")
}
/// True when a candidate is good enough to be returned immediately: either its raw text
/// was an all-caps author credit (`raw_caps`), or its cleaned text looks like a byline.
fn is_priority_dom_candidate(candidate: &DomBylineCandidate, raw_caps: bool) -> bool {
    if raw_caps {
        return true;
    }
    utils::looks_like_byline(&candidate.text)
}
/// True when `element` or one of its nearest `max_depth` ancestor elements has a `class`
/// or `id` that contains any of `keywords` as a substring.
///
/// Attribute values are lowercased before matching; `keywords` are used as given, so they
/// are expected to be lowercase. The walk stops at the first non-element ancestor.
fn ancestor_has_keyword(element: &ElementRef, keywords: &[&str], max_depth: usize) -> bool {
    // The element itself counts as depth 0, hence `max_depth + 1` nodes are inspected.
    std::iter::successors(Some(*element), |node| {
        node.parent().and_then(ElementRef::wrap)
    })
    .take(max_depth.saturating_add(1))
    .any(|node| {
        let class = node.value().attr("class").unwrap_or("").to_lowercase();
        let id = node.value().attr("id").unwrap_or("").to_lowercase();
        keywords
            .iter()
            .any(|keyword| class.contains(*keyword) || id.contains(*keyword))
    })
}
/// True when `element`, or one of its nearest 16 ancestors, has a class or id containing
/// one of the keywords below (footers, sidebars, comments, promos, and similar).
///
/// Matching is by substring (see `ancestor_has_keyword`), so broad entries such as
/// "footer" or "promo" already cover the more specific ones that contain them.
fn is_ignorable_byline_context(element: &ElementRef) -> bool {
const KEYWORDS: [&str; 34] = [
"post-footer",
"entry-footer",
"article-footer",
"section-footer",
"postmeta",
"meta-footer",
"footer",
"profile",
"sidebar",
"widget",
"comment",
"bio",
"related-post",
"user-bylines",
"byline__body",
"byline__title",
"post-info",
"entry-byline",
"entry-author",
"assetauthor",
"contentpromo",
"promo",
"asset-author",
"videopromo",
"poponscroll",
"most-popular",
"popular-stories",
"videoslide",
"video-container",
"card-box",
"article-view-box",
"cardbox",
"article-content",
"story-info",
];
ancestor_has_keyword(element, &KEYWORDS, 16)
}
/// True when `element`, or one of its nearest 16 ancestors, has a class or id containing
/// one of the keywords below (video/promo modules, recirculation, social/share widgets,
/// galleries, counters, and similar).
///
/// Matching is by substring (see `ancestor_has_keyword`), so broad entries such as
/// "promo", "popular" or "related" already cover the more specific ones containing them.
fn is_noise_byline_context(element: &ElementRef) -> bool {
const KEYWORDS: [&str; 27] = [
"videopromo",
"videoslide",
"video-slide",
"video-module",
"poponscroll",
"contentpromo",
"promo",
"popular",
"most-popular",
"popular-stories",
"more-stories",
"related",
"recirc",
"recommend",
"newsletter",
"signup",
"asset",
"social",
"share",
"gallery",
"slideshow",
"indepth",
"indepth-module",
"hot_stats",
"hot-stats",
"trending-badge",
"views",
];
ancestor_has_keyword(element, &KEYWORDS, 16)
}
/// Determines the document language.
///
/// Sources are tried in order; the first non-blank (trimmed) value wins:
/// 1. the `lang` attribute of the root `<html>` element,
/// 2. the `content` of a `meta[http-equiv='Content-Language']` tag (either spelling),
/// 3. the `content` of a `meta[name='lang']` or `meta[name='language']` tag.
///
/// Returns `None` when no source provides a value.
fn extract_language_from_document(document: &Html) -> Option<String> {
    /// First non-blank, trimmed `content` attribute among elements matching `selector`.
    fn first_meta_content(document: &Html, selector: &str) -> Option<String> {
        let selector = Selector::parse(selector).ok()?;
        let found = document
            .select(&selector)
            .filter_map(|meta| meta.value().attr("content"))
            .map(str::trim)
            .find(|lang| !lang.is_empty())
            .map(str::to_string);
        found
    }

    // `root_element()` already is the <html> element, so `lang` must be read from it
    // directly rather than from its first child (which is normally <head>).
    let root = document.root_element();
    let html_lang = if root.value().name() == "html" {
        root.value()
            .attr("lang")
            .map(str::trim)
            .filter(|lang| !lang.is_empty())
            .map(str::to_string)
    } else {
        None
    };

    html_lang
        .or_else(|| {
            first_meta_content(
                document,
                "meta[http-equiv='Content-Language'], meta[http-equiv='content-language']",
            )
        })
        .or_else(|| first_meta_content(document, "meta[name='lang'], meta[name='language']"))
}
/// Derives a clean article title from the document's `<title>` element.
///
/// Returns `None` when there is no `<title>` or its trimmed text is empty.
/// Otherwise the raw title is refined with these heuristics, in order:
///
/// * **Separators** (` | `, ` - `, ` – `, ` — `, ` \ `, ` / `, ` > `, ` » `):
///   keep the text before the last separator; if fewer than three words
///   remain, drop everything up to and including the first separator
///   character instead.
/// * **Colon** (`": "`), unless an `<h1>`/`<h2>` has exactly the title text:
///   keep the text after the last colon; if that has fewer than three words,
///   use the text after the first colon, unless more than five words precede
///   it, in which case the title is left untouched.
/// * **Unusual length** (more than 150 or fewer than 15 characters): use the
///   text of the page's `<h1>` if there is exactly one.
///
/// The candidate is then whitespace-normalised via `REGEXPS.normalize`. If it
/// ends up with four words or fewer, the original title is restored unless a
/// hierarchical separator (`\`, `/`, `>`, `»`) was present and exactly one
/// word was removed.
fn extract_title_from_document(document: &Html) -> Option<String> {
    fn word_count(s: &str) -> usize {
        s.split_whitespace().count()
    }

    let title_selector = Selector::parse("title").unwrap();
    let title_elem = document.select(&title_selector).next()?;
    let orig_title = title_elem.text().collect::<String>().trim().to_string();
    if orig_title.is_empty() {
        return None;
    }

    let mut cur_title = orig_title.clone();
    let mut title_had_hierarchical_separators = false;
    let sep_regex = regex::Regex::new(r"\s(\||\-|–|—|\\|/|>|»)\s").unwrap();

    if let Some(last_sep) = sep_regex.find_iter(&orig_title).last() {
        title_had_hierarchical_separators = regex::Regex::new(r"\s[\\//>»]\s")
            .unwrap()
            .is_match(&orig_title);
        cur_title = orig_title[..last_sep.start()].to_string();
        // Too short: the real title is probably *after* the first separator.
        if word_count(&cur_title) < 3 {
            let first_sep_regex =
                regex::Regex::new(r"(?i)^[^\|\-–—\\//>»]*[\|\-–—\\//>»]").unwrap();
            cur_title = first_sep_regex.replace(&orig_title, "").to_string();
        }
    } else if orig_title.contains(": ") {
        // A heading that repeats the full title means the colon is part of the
        // headline itself, so nothing should be cut.
        let h_selector = Selector::parse("h1, h2").unwrap();
        let has_matching_heading = document
            .select(&h_selector)
            .any(|h| h.text().collect::<String>().trim() == orig_title.as_str());
        if !has_matching_heading {
            if let Some(last_colon_pos) = orig_title.rfind(':') {
                let after_last = orig_title[(last_colon_pos + 1)..].trim();
                if word_count(after_last) >= 3 {
                    cur_title = after_last.to_string();
                } else if let Some(first_colon_pos) = orig_title.find(':') {
                    let before_first = &orig_title[..first_colon_pos];
                    // With more than five words before the colon the title is
                    // left as it was.
                    if word_count(before_first) <= 5 {
                        cur_title = orig_title[(first_colon_pos + 1)..].trim().to_string();
                    }
                }
            }
        }
    } else {
        // Measure in characters, not UTF-8 bytes, so non-Latin titles are not
        // misjudged as too long (or long enough) by the thresholds.
        let title_chars = orig_title.chars().count();
        if title_chars > 150 || title_chars < 15 {
            let h1_selector = Selector::parse("h1").unwrap();
            let mut h1s = document.select(&h1_selector);
            // Only trust the heading when the page has exactly one `<h1>`.
            if let (Some(only_h1), None) = (h1s.next(), h1s.next()) {
                cur_title = only_h1.text().collect::<String>().trim().to_string();
            }
        }
    }

    cur_title = REGEXPS
        .normalize
        .replace_all(cur_title.trim(), " ")
        .to_string();

    let cur_word_count = word_count(&cur_title);
    if cur_word_count <= 4 {
        let orig_word_count = word_count(&sep_regex.replace_all(&orig_title, " "));
        // `cur + 1 != orig` is the underflow-free form of `cur != orig - 1`.
        if !title_had_hierarchical_separators || cur_word_count + 1 != orig_word_count {
            cur_title = orig_title;
        }
    }

    Some(cur_title)
}
/// Unit tests for JSON-LD, meta-tag, title and byline extraction.
///
/// The raw-string HTML fixtures are deliberately flush-left so their content
/// stays exactly as authored.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Headline, author name and description are read from a schema.org Article.
    #[test]
    fn test_json_ld_extraction() {
        let html = r#"
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Test Article",
"author": {"name": "John Doe"},
"description": "Test description"
}
</script>
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_json_ld(&document);
        assert_eq!(metadata.title, Some("Test Article".to_string()));
        assert_eq!(metadata.byline, Some("John Doe".to_string()));
        assert_eq!(metadata.excerpt, Some("Test description".to_string()));
    }

    /// A plain string `image` value is used as-is.
    #[test]
    fn test_json_ld_image_extraction() {
        let html = r#"
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Test Article",
"image": "https://example.com/image.jpg"
}
</script>
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_json_ld(&document);
        assert_eq!(
            metadata.image,
            Some("https://example.com/image.jpg".to_string())
        );
    }

    /// An `ImageObject` contributes its `url`.
    #[test]
    fn test_json_ld_image_object_extraction() {
        let html = r#"
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Test Article",
"image": {
"@type": "ImageObject",
"url": "https://example.com/image.jpg",
"width": 1200,
"height": 630
}
}
</script>
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_json_ld(&document);
        assert_eq!(
            metadata.image,
            Some("https://example.com/image.jpg".to_string())
        );
    }

    /// For an array of images the first entry wins.
    #[test]
    fn test_json_ld_image_array_extraction() {
        let html = r#"
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Test Article",
"image": [
"https://example.com/image1.jpg",
"https://example.com/image2.jpg"
]
}
</script>
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_json_ld(&document);
        assert_eq!(
            metadata.image,
            Some("https://example.com/image1.jpg".to_string())
        );
    }

    /// Open Graph and `author` meta tags fill title, byline and excerpt.
    #[test]
    fn test_meta_tag_extraction() {
        let html = r#"
<html>
<head>
<meta property="og:title" content="OG Title" />
<meta name="author" content="Jane Smith" />
<meta property="og:description" content="OG Description" />
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let json_ld = Metadata::default();
        let metadata = get_article_metadata(&document, json_ld);
        assert_eq!(metadata.title, Some("OG Title".to_string()));
        assert_eq!(metadata.byline, Some("Jane Smith".to_string()));
        assert_eq!(metadata.excerpt, Some("OG Description".to_string()));
    }

    /// `og:image` is picked up as the article image.
    #[test]
    fn test_og_image_extraction() {
        let html = r#"
<html>
<head>
<meta property="og:title" content="OG Title" />
<meta property="og:image" content="https://example.com/og-image.jpg" />
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_article_metadata(&document, Metadata::default());
        assert_eq!(
            metadata.image,
            Some("https://example.com/og-image.jpg".to_string())
        );
    }

    /// `og:image:secure_url` is preferred over plain `og:image`.
    #[test]
    fn test_og_image_secure_url_priority() {
        let html = r#"
<html>
<head>
<meta property="og:image" content="http://example.com/image.jpg" />
<meta property="og:image:secure_url" content="https://example.com/secure-image.jpg" />
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_article_metadata(&document, Metadata::default());
        assert_eq!(
            metadata.image,
            Some("https://example.com/secure-image.jpg".to_string())
        );
    }

    /// `twitter:image` is used; `twitter:image:alt` must not be mistaken for it.
    #[test]
    fn test_twitter_image_extraction() {
        let html = r#"
<html>
<head>
<meta name="twitter:image" content="https://example.com/twitter-image.jpg" />
<meta name="twitter:image:alt" content="Twitter alt text" />
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_article_metadata(&document, Metadata::default());
        assert_eq!(
            metadata.image,
            Some("https://example.com/twitter-image.jpg".to_string())
        );
    }

    /// A JSON-LD image outranks an `og:image` meta tag.
    #[test]
    fn test_json_ld_image_takes_priority() {
        let html = r#"
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Test Article",
"image": "https://example.com/json-ld-image.jpg"
}
</script>
<meta property="og:image" content="https://example.com/og-image.jpg" />
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let json_ld = get_json_ld(&document);
        let metadata = get_article_metadata(&document, json_ld);
        assert_eq!(
            metadata.image,
            Some("https://example.com/json-ld-image.jpg".to_string())
        );
    }

    /// `<link rel="image_src">` is accepted as an image source.
    #[test]
    fn test_link_image_src_extraction() {
        let html = r#"
<html>
<head>
<link rel="image_src" href="https://example.com/link-image.jpg" />
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_article_metadata(&document, Metadata::default());
        assert_eq!(
            metadata.image,
            Some("https://example.com/link-image.jpg".to_string())
        );
    }

    /// `<meta itemprop="image">` is accepted as an image source.
    #[test]
    fn test_itemprop_image_extraction() {
        let html = r#"
<html>
<head>
<meta itemprop="image" content="https://example.com/itemprop-image.jpg" />
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_article_metadata(&document, Metadata::default());
        assert_eq!(
            metadata.image,
            Some("https://example.com/itemprop-image.jpg".to_string())
        );
    }

    /// `<meta name="thumbnail">` is accepted as an image source.
    #[test]
    fn test_thumbnail_meta_extraction() {
        let html = r#"
<html>
<head>
<meta name="thumbnail" content="https://example.com/thumbnail.jpg" />
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_article_metadata(&document, Metadata::default());
        assert_eq!(
            metadata.image,
            Some("https://example.com/thumbnail.jpg".to_string())
        );
    }

    /// `article:author_name` supplies the byline.
    #[test]
    fn test_article_author_name_meta_is_respected() {
        let html = r#"
<html>
<head>
<meta name="article:author_name" content="Hazel Sheffield" />
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_article_metadata(&document, Metadata::default());
        assert_eq!(metadata.byline, Some("Hazel Sheffield".to_string()));
    }

    /// A pipe-separated `<title>` still yields a title containing the headline.
    #[test]
    fn test_title_extraction() {
        let html = r#"
<html>
<head>
<title>Article Title | Site Name</title>
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let title = extract_title_from_document(&document);
        assert!(title.is_some());
        assert!(title.as_ref().unwrap().contains("Article Title"));
    }

    /// Splitting on the colon would leave only two words, so the short-title
    /// fallback restores the full original title.
    #[test]
    fn test_title_extraction_colon() {
        let html = r#"
<html>
<head>
<title>Site Name: Article Title</title>
</head>
</html>
"#;
        let document = Html::parse_document(html);
        let title = extract_title_from_document(&document);
        assert_eq!(title.as_deref(), Some("Site Name: Article Title"));
    }

    /// `<a rel="author">` inside the article is used as the byline.
    #[test]
    fn test_byline_extraction_from_document() {
        let html = r#"
<html>
<body>
<article>
<a rel="author" href="/author/john">John Doe</a>
<p>Article content here</p>
</article>
</body>
</html>
"#;
        let document = Html::parse_document(html);
        let json_ld = Metadata::default();
        let metadata = get_article_metadata(&document, json_ld);
        assert_eq!(metadata.byline, Some("John Doe".to_string()));
    }

    /// An element with class `byline` is used as the byline.
    #[test]
    fn test_byline_extraction_from_class() {
        let html = r#"
<html>
<body>
<article>
<p class="byline">By Jane Smith</p>
<p>Article content here</p>
</article>
</body>
</html>
"#;
        let document = Html::parse_document(html);
        let json_ld = Metadata::default();
        let metadata = get_article_metadata(&document, json_ld);
        assert!(metadata.byline.is_some());
        assert!(metadata.byline.as_ref().unwrap().contains("Jane Smith"));
    }

    /// The `author` meta tag wins over a DOM byline here.
    #[test]
    fn test_byline_extraction_priority() {
        let html = r#"
<html>
<head>
<meta name="author" content="Meta Author" />
</head>
<body>
<article>
<p class="byline">Document Author</p>
</article>
</body>
</html>
"#;
        let document = Html::parse_document(html);
        let json_ld = Metadata::default();
        let metadata = get_article_metadata(&document, json_ld);
        assert_eq!(metadata.byline, Some("Meta Author".to_string()));
    }

    /// An author nested under a `post-footer` container is ignorable.
    #[test]
    fn test_ignorable_byline_context_detects_footer() {
        let html = r#"
<div class="post-footer">
<div class="post-footer-line">
<span class="post-author">Posted by <span itemprop="name">Jane Doe</span></span>
</div>
</div>
"#;
        let fragment = Html::parse_fragment(html);
        let selector = Selector::parse(".post-author").unwrap();
        let elem = fragment.select(&selector).next().unwrap();
        assert!(is_ignorable_byline_context(&elem));
    }

    /// An author link inside a `profile widget` container is ignorable.
    #[test]
    fn test_ignorable_byline_context_detects_profile_widget() {
        let html = r#"
<div class="profile widget">
<a rel="author" href="/user/jane">Jane Doe</a>
</div>
"#;
        let fragment = Html::parse_fragment(html);
        let selector = Selector::parse("a[rel='author']").unwrap();
        let elem = fragment.select(&selector).next().unwrap();
        assert!(is_ignorable_byline_context(&elem));
    }

    /// An author inside a `user-bylines` / `byline__body` block is ignorable.
    #[test]
    fn test_ignorable_byline_context_detects_byline_body_block() {
        let html = r#"
<div class="user-bylines">
<div class="byline__body">
<a class="byline__author">Jane Doe</a>
<div class="byline__title">BuzzFeed News Reporter</div>
</div>
</div>
"#;
        let fragment = Html::parse_fragment(html);
        let selector = Selector::parse(".byline__author").unwrap();
        let elem = fragment.select(&selector).next().unwrap();
        assert!(is_ignorable_byline_context(&elem));
    }

    /// End to end: a `user-bylines` block produces no byline at all.
    #[test]
    fn test_user_bylines_block_is_ignored_during_extraction() {
        let html = r#"
<html>
<body>
<header class="page-head">
<div class="user-bylines">
<div class="byline__body">
<a class="byline__author">Jane Doe</a>
<div class="byline__title">BuzzFeed News Reporter</div>
</div>
</div>
</header>
</body>
</html>
"#;
        let document = Html::parse_document(html);
        let json_ld = Metadata::default();
        let metadata = get_article_metadata(&document, json_ld);
        assert!(metadata.byline.is_none());
    }

    /// An `article-author` element outside any footer is still honoured.
    #[test]
    fn test_article_author_class_outside_footer_is_respected() {
        let html = r#"
<html>
<body>
<article>
<aside>
<p>
<span class="article-author" itemprop="author" itemscope itemtype="http://schema.org/Person">
<span itemprop="name">Nicolas Perriault</span>
</span>
</p>
</aside>
</article>
</body>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_article_metadata(&document, Metadata::default());
        assert_eq!(metadata.byline, Some("Nicolas Perriault".to_string()));
    }

    /// A byline already embedded in the site name is dropped as redundant.
    #[test]
    fn test_site_name_redundant_byline_is_removed() {
        let html = r#"
<html>
<head>
<meta property="og:site_name" content="SIMPLYFOUND.COM | BY: Joe Wee"/>
</head>
<body>
<article>
<p class="byline">
<span itemprop="author" itemscope itemtype="http://schema.org/Person">
<span itemprop="name">Joe Wee</span>
</span>
</p>
</article>
</body>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_article_metadata(&document, Metadata::default());
        assert!(metadata.byline.is_none());
    }

    /// Fixture test: the Breitbart page exposes a usable `.byline` candidate.
    #[test]
    fn test_breitbart_byline_is_extracted() {
        let html = fs::read_to_string("tests/test-pages/breitbart/source.html").unwrap();
        let document = Html::parse_document(&html);
        let selector = Selector::parse(".byline").unwrap();
        let mut saw_lucas = false;
        for elem in document.select(&selector) {
            if is_ignorable_byline_context(&elem) || is_noise_byline_context(&elem) {
                continue;
            }
            let text = collect_byline_candidate_text(elem).trim().to_string();
            if text.contains("Lucas Nolan") {
                saw_lucas = true;
                break;
            }
        }
        assert!(saw_lucas, "expected to find Lucas Nolan byline candidate");
        let dom_byline = extract_byline_from_document(&document);
        assert!(
            dom_byline.is_some(),
            "expected Breitbart byline to be detected"
        );
    }

    /// Fixture test: the CNET author-info block yields the author name.
    #[test]
    fn test_cnet_authorinfo_is_extracted() {
        let html = fs::read_to_string("tests/test-pages/cnet/source.html").unwrap();
        let document = Html::parse_document(&html);
        let dom_byline = extract_byline_from_document(&document).map(|c| c.text);
        assert_eq!(dom_byline, Some("Steven Musil".to_string()));
    }

    /// Fixture test: an all-caps, high-confidence DOM byline overrides the meta author.
    #[test]
    fn test_herald_sun_caps_byline_overrides_meta() {
        let html = fs::read_to_string("tests/test-pages/herald-sun-1/source.html").unwrap();
        let document = Html::parse_document(&html);
        let dom_byline = extract_byline_from_document(&document).expect("dom byline");
        assert_eq!(dom_byline.text, "JOE HILDEBRAND");
        assert_eq!(dom_byline.confidence, DomBylineConfidence::High);
        assert!(
            should_prefer_dom_byline("by: Laurie Oakes", &dom_byline.text, dom_byline.confidence),
            "dom byline should override Laurie Oakes"
        );
        let metadata = get_article_metadata(&document, Metadata::default());
        assert_eq!(metadata.byline, Some("JOE HILDEBRAND".to_string()));
    }

    /// All-caps names are recognised; mixed case and stats banners are not.
    #[test]
    fn test_caps_author_detection() {
        assert!(looks_like_caps_author("JOE HILDEBRAND"));
        assert!(!looks_like_caps_author("Laurie Oakes"));
        assert!(!looks_like_caps_author("TOP POST 653,817 VIEWS"));
    }

    /// A DOM byline replaces a meta author that is only an agency credit.
    #[test]
    fn test_dom_byline_overrides_agency_credit() {
        let html = r#"
<html>
<head>
<meta property="og:title" content="Titre" />
<meta name="author" content="AFP" />
</head>
<body>
<article>
<p class="byline">Par <span>Sébastien Farcis</span></p>
<p>Contenu principal</p>
</article>
</body>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_article_metadata(&document, Metadata::default());
        assert_eq!(metadata.byline, Some("Par Sébastien Farcis".to_string()));
    }

    /// A DOM byline replaces a meta author that is only a dateline.
    #[test]
    fn test_dom_byline_overrides_dateline_meta() {
        let html = r#"
<html>
<head>
<meta property="og:title" content="Titre" />
<meta name="author" content="CAIRO" />
</head>
<body>
<article>
<p class="byline">By <span>Erin Cunningham</span></p>
<p>Contenu principal</p>
</article>
</body>
</html>
"#;
        let document = Html::parse_document(html);
        let metadata = get_article_metadata(&document, Metadata::default());
        assert_eq!(metadata.byline, Some("By Erin Cunningham".to_string()));
    }

    /// Fixture test: the Washington Post `.pb-byline` element is found and used.
    #[test]
    fn test_wapo_byline_is_detected() {
        let html = fs::read_to_string("tests/test-pages/wapo-1/source.html").unwrap();
        let document = Html::parse_document(&html);
        let selector = Selector::parse(".pb-byline").unwrap();
        // Select once; `ElementRef` is `Copy`, so it is passed by value.
        let elem = document
            .select(&selector)
            .next()
            .expect("pb-byline element not found");
        let text = collect_byline_candidate_text(elem);
        assert!(
            text.contains("Erin Cunningham"),
            "pb-byline text was {:?}",
            text
        );
        let dom_byline = extract_byline_from_document(&document).expect("should detect DOM byline");
        assert_eq!(dom_byline.text, "By Erin Cunningham");
    }
}