use once_cell::sync::Lazy;
use regex::{Regex, RegexSet};
use std::collections::HashSet;
/// Single-bit `u32` flags. Each constant occupies its own bit, so any combination can be
/// OR-ed into one mask and tested with `&`.
pub mod flags {
    /// Bit 0 (`0x1`). NOTE(review): presumably enables removal of unlikely candidates —
    /// confirm against the code that reads this flag.
    pub const FLAG_STRIP_UNLIKELYS: u32 = 0b001;
    /// Bit 1 (`0x2`). NOTE(review): presumably enables class/id weighting — confirm.
    pub const FLAG_WEIGHT_CLASSES: u32 = 0b010;
    /// Bit 2 (`0x4`). NOTE(review): presumably enables conditional cleaning — confirm.
    pub const FLAG_CLEAN_CONDITIONALLY: u32 = 0b100;
}
/// Default tuning values.
pub mod defaults {
/// Default character threshold: 500.
/// NOTE(review): presumably the minimum text length an extraction result must reach;
/// the consumer is not visible in this file — confirm against the caller.
pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
}
/// Lazily built set of upper-case tag names: `SECTION`, `H2`–`H6`, `P`, `TD` and `PRE`.
///
/// NOTE(review): going by the name, these are the elements considered for scoring by
/// default; the scoring code is not part of this file.
pub static DEFAULT_TAGS_TO_SCORE: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    HashSet::from(["SECTION", "H2", "H3", "H4", "H5", "H6", "P", "TD", "PRE"])
});
/// Regular expressions compiled lazily on first access.
///
/// Every pattern is a string literal, so the `unwrap()` calls can only panic if a literal
/// itself is malformed. This file must be stored as UTF-8: several patterns contain
/// non-ASCII alternatives (Chinese, Cyrillic, typographic dashes) that have to be kept as
/// real characters for the matches to work.
pub mod regexps {
    use super::*;

    /// Case-insensitive, unanchored list of substrings such as `banner`, `sidebar` or
    /// `footer`.
    /// NOTE(review): presumably tested against an element's class/id string — confirm.
    pub static UNLIKELY_CANDIDATES: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote").unwrap()
    });

    /// Case-insensitive, unanchored list of substrings such as `article`, `content` or
    /// `main`.
    /// NOTE(review): presumably overrides an [`UNLIKELY_CANDIDATES`] hit — confirm.
    pub static OK_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?i)and|article|body|column|content|main|mathjax|shadow").unwrap()
    });

    /// Case-insensitive, unanchored substrings treated as a positive signal by name.
    pub static POSITIVE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story").unwrap()
    });

    /// Case-insensitive substrings treated as a negative signal by name.
    ///
    /// `hid` is only matched as a whole space-delimited token (the four `hid` alternatives
    /// cover start, end, middle and whole-string positions); everything else is a plain
    /// substring match.
    pub static NEGATIVE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?i)-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget").unwrap()
    });

    /// Case-insensitive, unanchored match for byline/author style markers.
    pub static BYLINE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"(?i)byline|author|dateline|writtenby|p-author").unwrap());

    /// Matches a run of two or more whitespace characters.
    pub static NORMALIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s{2,}").unwrap());

    /// Case-insensitive match for protocol-relative-or-absolute URLs (`//host`) on a fixed
    /// list of video hosts.
    ///
    /// NOTE(review): the dot in `live.bilibili` is unescaped and therefore matches any
    /// character; left as-is to keep existing matching behaviour.
    pub static VIDEOS: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?i)//(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq|bilibili|live.bilibili)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)").unwrap()
    });

    /// Case-insensitive match for `share` or `sharedaddy` delimited on each side by a word
    /// boundary or an underscore.
    pub static SHARE_ELEMENTS: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"(?i)(\b|_)(share|sharedaddy)(\b|_)").unwrap());

    /// Matches one or more non-word characters; usable as a token separator.
    pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\W+").unwrap());

    /// Matches one `srcset`-style entry.
    ///
    /// Capture groups: (1) a run of non-whitespace, (2) an optional descriptor made of
    /// digits/dots followed by `x` or `w`, (3) the trailing comma or end of input.
    pub static SRCSET_URL: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))").unwrap());

    /// Case-insensitive match for the `data:<type>;base64,` prefix of a data URL.
    /// Capture group 1 is the media type token.
    pub static B64_DATA_URL: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"(?i)^data:\s*([^\s;,]+)\s*;\s*base64\s*,").unwrap());

    /// Case-sensitive list of JSON-LD article type names.
    ///
    /// NOTE(review): `^` applies only to the first alternative (`Article`) and `$` only to
    /// the last (`APIReference`); every alternative in between is an unanchored substring
    /// match. Left unchanged because tightening it would alter which inputs match.
    pub static JSON_LD_ARTICLE_TYPES: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$").unwrap()
    });

    /// Case-insensitive, Unicode-aware, fully anchored match for a string that consists
    /// solely of an "advertisement" word in one of several languages.
    pub static AD_WORDS: Lazy<Regex> = Lazy::new(|| {
        Regex::new(
            r"(?iu)^(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)$",
        )
        .unwrap()
    });

    /// Case-insensitive, Unicode-aware, fully anchored match for a string that consists
    /// solely of a "loading" word in one of several languages, optionally followed by `…`
    /// or `...`.
    ///
    /// The Chinese alternative is `正在加载`. It must be stored as those four characters;
    /// a mis-decoded byte sequence would silently never match.
    pub static LOADING_WORDS: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?iu)^((loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?)$").unwrap()
    });

    /// Matches a title separator character (`|`, `-`, `–`, `—`, `\`, `/`, `>`, `»`)
    /// surrounded by single whitespace characters.
    pub static TITLE_SEPARATOR: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"\s[\|\-–—\\\/>»]\s").unwrap());

    /// Matches a hierarchical separator (`\`, `/`, `>`, `»`) surrounded by single
    /// whitespace characters.
    pub static TITLE_HIERARCHICAL: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"\s[\\/>\u{00BB}]\s").unwrap());

    /// Matches everything from the start of the input up to and including the first
    /// separator character.
    pub static TITLE_FIRST_PART: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"^[^\|\-–—\\\/>»]*[\|\-–—\\\/>»]").unwrap());

    /// Combined set for a single pass over a string: index 0 is [`NEGATIVE`], index 1 is
    /// [`POSITIVE`].
    pub static CLASS_WEIGHT_SET: Lazy<RegexSet> = Lazy::new(|| {
        RegexSet::new([NEGATIVE.as_str(), POSITIVE.as_str()]).unwrap()
    });

    /// Combined set: index 0 is [`UNLIKELY_CANDIDATES`], index 1 is
    /// [`OK_MAYBE_ITS_A_CANDIDATE`].
    pub static CANDIDATE_FILTER_SET: Lazy<RegexSet> = Lazy::new(|| {
        RegexSet::new([
            UNLIKELY_CANDIDATES.as_str(),
            OK_MAYBE_ITS_A_CANDIDATE.as_str(),
        ])
        .unwrap()
    });

    /// Combined set: index 0 is [`AD_WORDS`], index 1 is [`LOADING_WORDS`].
    pub static AD_LOADING_SET: Lazy<RegexSet> = Lazy::new(|| {
        RegexSet::new([AD_WORDS.as_str(), LOADING_WORDS.as_str()]).unwrap()
    });
}
/// Lazily built set of lower-case role names (`menu`, `navigation`, `dialog`, …).
///
/// NOTE(review): presumably compared against an element's `role` attribute to rule it out
/// as content; the lookup site is not in this file.
pub static UNLIKELY_ROLES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    HashSet::from([
        "menu",
        "menubar",
        "complementary",
        "navigation",
        "alert",
        "alertdialog",
        "dialog",
    ])
});
/// Lazily built set of upper-case block-level tag names.
///
/// NOTE(review): by its name this set takes part in the decision to turn a `DIV` into a
/// `P`; the exact rule lives outside this file.
pub static DIV_TO_P_ELEMS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    HashSet::from([
        "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL",
    ])
});
/// Lazily built set of upper-case tag names: `DIV`, `ARTICLE`, `SECTION`, `P`, `OL`, `UL`.
///
/// NOTE(review): the name suggests these tags are exempt from being rewritten to `DIV`;
/// confirm against the caller.
pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&'static str>> =
    Lazy::new(|| HashSet::from(["DIV", "ARTICLE", "SECTION", "P", "OL", "UL"]));
/// Lower-case names of presentational HTML attributes, in alphabetical order.
///
/// Exposed as an ordered slice rather than a set, so callers can iterate it directly.
pub static PRESENTATIONAL_ATTRIBUTES: &[&str] = &[
    // Colour / background / alignment.
    "align", "background", "bgcolor", "border",
    // Table layout.
    "cellpadding", "cellspacing", "frame", "hspace", "rules",
    // Inline styling and spacing.
    "style", "valign", "vspace",
];
/// Lazily built set of upper-case tag names: `TABLE`, `TH`, `TD`, `HR`, `PRE`.
///
/// NOTE(review): by its name, these are the elements whose deprecated size attributes
/// (e.g. `width`/`height`) get special handling; confirm against the caller.
pub static DEPRECATED_SIZE_ATTRIBUTE_ELEMS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    HashSet::from(["TABLE", "TH", "TD", "HR", "PRE"])
});
/// Lazily built set of upper-case tag names for inline ("phrasing") elements.
///
/// The list is kept alphabetical to make additions easy to review.
pub static PHRASING_ELEMS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    HashSet::from([
        "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE",
        "DATA", "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT",
        "KBD", "LABEL", "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT",
        "PROGRESS", "Q", "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN",
        "STRONG", "SUB", "SUP", "TEXTAREA", "TIME", "VAR", "WBR",
    ])
});
/// Recognised image file extensions, lower-case and without the leading dot.
const IMAGE_EXTS: [&[u8]; 5] = [b"jpg", b"jpeg", b"png", b"webp", b"avif"];

/// Tests whether `bytes[start..]` begins with one of [`IMAGE_EXTS`], ignoring ASCII case.
///
/// Returns the byte length of the first table entry that matches, or `None` when nothing
/// matches or `start` lies beyond the end of `bytes`.
#[inline]
fn match_image_ext(bytes: &[u8], start: usize) -> Option<usize> {
    let tail = bytes.get(start..)?;
    IMAGE_EXTS.iter().copied().find_map(|ext| {
        // `get` yields `None` when fewer than `ext.len()` bytes remain.
        let head = tail.get(..ext.len())?;
        head.eq_ignore_ascii_case(ext).then_some(ext.len())
    })
}
/// Returns `true` when `s` contains, anywhere, a `.` immediately followed by a recognised
/// image extension (ASCII case-insensitive). Nothing is required of the characters after
/// the extension.
#[inline]
pub fn has_image_extension(s: &str) -> bool {
    let raw = s.as_bytes();
    // `.` is ASCII, so the byte offsets from `match_indices` index `raw` directly.
    s.match_indices('.')
        .any(|(dot, _)| match_image_ext(raw, dot + 1).is_some())
}
#[inline]
pub fn has_image_srcset(s: &str) -> bool {
let bytes = s.as_bytes();
for (i, &b) in bytes.iter().enumerate() {
if b == b'.'
&& let Some(ext_len) = match_image_ext(bytes, i + 1)
{
let after = i + 1 + ext_len;
if after < bytes.len()
&& bytes[after].is_ascii_whitespace()
&& let Some(pos) = bytes[after..]
.iter()
.position(|&c| !c.is_ascii_whitespace())
&& bytes[after + pos].is_ascii_digit()
{
return true;
}
}
}
false
}
#[inline]
pub fn has_image_src(s: &str) -> bool {
let trimmed = s.trim();
if trimmed.is_empty() || trimmed.contains(char::is_whitespace) {
return false;
}
let bytes = trimmed.as_bytes();
for (i, &b) in bytes.iter().enumerate().rev() {
if b == b'.'
&& let Some(ext_len) = match_image_ext(bytes, i + 1)
{
let after = i + 1 + ext_len;
if after >= bytes.len() || bytes[after] == b'?' || bytes[after] == b'#' {
return true;
}
}
}
false
}