use kuchikiki::{parse_html, traits::TendrilSink};
use url::Url;
/// CSS selectors that `clean_html` strips when `only_main_content` is enabled.
///
/// A match is kept anyway when one of the `FORCE_INCLUDE_MAIN_TAGS` selectors matches
/// inside it. The entries are element names, class selectors and id selectors; the
/// selectors are tried in the order listed here.
pub const EXCLUDE_NON_MAIN_TAGS: &[&str] = &[
    // Element names.
    "header",
    "footer",
    "nav",
    "aside",
    // Header-like classes and ids.
    ".header",
    ".top",
    ".navbar",
    "#header",
    // Footer-like classes and ids.
    ".footer",
    ".bottom",
    "#footer",
    // Sidebars.
    ".sidebar",
    ".side",
    ".aside",
    "#sidebar",
    // Modals, popups and overlays.
    ".modal",
    ".popup",
    "#modal",
    ".overlay",
    // Advertising.
    ".ad",
    ".ads",
    ".advert",
    "#ad",
    // Language selectors.
    ".lang-selector",
    ".language",
    "#language-selector",
    // Social links.
    ".social",
    ".social-media",
    ".social-links",
    "#social",
    // Menus and navigation.
    ".menu",
    ".navigation",
    "#nav",
    // Breadcrumbs.
    ".breadcrumbs",
    "#breadcrumbs",
    // Share controls.
    ".share",
    "#share",
    // Widgets.
    ".widget",
    "#widget",
    // Cookie notices.
    ".cookie",
    "#cookie",
    // Miscellaneous.
    ".fc-decoration",
];
/// CSS selectors that protect an element from `only_main_content` stripping.
///
/// In `clean_html`, an element matched by `EXCLUDE_NON_MAIN_TAGS` is left in the document
/// when any selector in this list finds a match inside it.
pub const FORCE_INCLUDE_MAIN_TAGS: &[&str] = &[
    // Explicit main-content id.
    "#main",
    // `.swoogo-*` component classes.
    ".swoogo-cols",
    ".swoogo-text",
    ".swoogo-table-div",
    ".swoogo-space",
    ".swoogo-alert",
    ".swoogo-sponsors",
    ".swoogo-title",
    ".swoogo-tabs",
    ".swoogo-logo",
    ".swoogo-image",
    ".swoogo-button",
    ".swoogo-agenda",
];
/// Options accepted by `clean_html`.
pub struct CleanOptions<'a> {
/// URL of the page being cleaned. It is parsed with `Url::parse` and used as the base
/// against which `img[src]` and `a[href]` values are resolved.
pub url: &'a str,
/// Caller-supplied CSS selectors; elements matching any of them are removed from the
/// document.
pub exclude_tags: &'a [&'a str],
/// When `true`, elements matching `EXCLUDE_NON_MAIN_TAGS` are also removed, except those
/// in which a `FORCE_INCLUDE_MAIN_TAGS` selector finds a match.
pub only_main_content: bool,
}
/// Error type returned by `clean_html`.
#[derive(Debug, thiserror::Error)]
pub enum CleanError {
/// The base URL could not be parsed; wraps the `url` crate's parse error (converted
/// automatically through `#[from]`).
#[error("url parse: {0}")]
Url(#[from] url::ParseError),
/// A CSS selector was rejected by the selector engine; the payload is the selector text.
#[error("selector: {0}")]
Selector(String),
}
pub fn clean_html(html: &str, opts: &CleanOptions<'_>) -> Result<String, CleanError> {
let document = parse_html().one(html);
let url = Url::parse(opts.url)?;
for tag in ["head", "meta", "noscript", "style", "script"] {
while let Ok(hit) = document.select_first(tag) {
hit.as_node().detach();
}
}
for sel in opts.exclude_tags {
while let Ok(hit) = document.select_first(sel) {
hit.as_node().detach();
}
}
if opts.only_main_content {
for outer in EXCLUDE_NON_MAIN_TAGS.iter() {
let matches: Vec<_> = document
.select(outer)
.map_err(|_| CleanError::Selector((*outer).to_string()))?
.collect();
for node in matches {
let keep = FORCE_INCLUDE_MAIN_TAGS.iter().any(|inner| {
node.as_node()
.select(inner)
.is_ok_and(|mut it| it.next().is_some())
});
if !keep {
node.as_node().detach();
}
}
}
}
resolve_srcset(&document);
resolve_attr(&document, "img[src]", "src", &url);
resolve_attr(&document, "a[href]", "href", &url);
Ok(document.to_string())
}
/// For every `img[srcset]` in `document`, overwrites `src` with the URL of the candidate
/// that has the numerically largest descriptor.
///
/// Candidates are obtained by splitting `srcset` on `,`. When every parsed candidate uses
/// a density (`x`) descriptor, the existing `src` value (if any) takes part as a `1x`
/// candidate. On a tie the candidate listed first wins. Images whose `srcset` yields no
/// candidate and that have no `src` are left unchanged.
///
/// NOTE(review): splitting on `,` also splits URLs that themselves contain commas; such
/// candidates are not reassembled here.
fn resolve_srcset(document: &kuchikiki::NodeRef) {
    let Ok(matches) = document.select("img[srcset]") else {
        return;
    };
    let imgs: Vec<_> = matches.collect();
    for img in imgs {
        let best = {
            // Shared borrow of the attribute map; released at the end of this block so the
            // mutable borrow below cannot conflict with it.
            let attrs = img.attributes.borrow();
            let Some(raw) = attrs.get("srcset") else {
                continue;
            };
            let mut sources: Vec<(String, f64, bool)> = raw
                .split(',')
                .filter_map(parse_srcset_candidate)
                .collect();
            if sources.iter().all(|(_, _, is_density)| *is_density) {
                if let Some(src) = attrs.get("src") {
                    sources.push((src.to_string(), 1.0, true));
                }
            }
            // `min_by` with the operands swapped yields the largest descriptor and, for
            // equal descriptors, the earliest candidate. `total_cmp` is a genuine total
            // order, unlike `partial_cmp` with a fallback.
            sources
                .into_iter()
                .min_by(|a, b| b.1.total_cmp(&a.1))
                .map(|(candidate_url, _, _)| candidate_url)
        };
        if let Some(candidate_url) = best {
            img.attributes.borrow_mut().insert("src", candidate_url);
        }
    }
}

/// Parses one comma-separated `srcset` entry into `(url, descriptor value, is_density)`.
///
/// The final whitespace-separated token is treated as the descriptor when it is not the
/// only token and ends in `x` (density) or `w` (width); otherwise the whole entry is the
/// URL with an implicit `1x`. Returns `None` for a blank entry and for a descriptor whose
/// numeric part does not parse to a finite number. Rust's float parser accepts `nan` and
/// `inf`, and such values must not take part in the size comparison.
fn parse_srcset_candidate(candidate: &str) -> Option<(String, f64, bool)> {
    let tokens: Vec<&str> = candidate.split_whitespace().collect();
    let (&last, head) = tokens.split_last()?;

    let descriptor = if head.is_empty() {
        None
    } else {
        last.strip_suffix('x')
            .map(|value| (value, true))
            .or_else(|| last.strip_suffix('w').map(|value| (value, false)))
    };

    match descriptor {
        Some((value, is_density)) => {
            let size: f64 = value.parse().ok()?;
            size.is_finite().then(|| (head.join(" "), size, is_density))
        }
        None => Some((tokens.join(" "), 1.0, true)),
    }
}
/// Rewrites attribute `attr` on every element matching `selector` to the result of
/// joining its current value onto `base`.
///
/// Does nothing when `selector` is rejected. Elements that lack the attribute, or whose
/// value cannot be joined onto `base`, are left as they are.
fn resolve_attr(document: &kuchikiki::NodeRef, selector: &str, attr: &str, base: &Url) {
    if let Ok(elements) = document.select(selector) {
        for element in elements {
            // Copy the value out so the shared borrow ends before the map is mutated.
            let current = element.attributes.borrow().get(attr).map(str::to_owned);
            let resolved = current.and_then(|value| base.join(&value).ok());
            if let Some(absolute) = resolved {
                element
                    .attributes
                    .borrow_mut()
                    .insert(attr, absolute.to_string());
            }
        }
    }
}
/// Strips Markdown "skip to content" anchor links from `input`.
///
/// A link is removed when it has the shape `[Skip to Content](#...)`: the label is
/// compared ASCII case-insensitively, the target must start with `#`, and the link ends at
/// the first `)` that follows. A candidate with no closing `)` is left untouched, as is
/// all other text.
pub fn remove_skip_to_content_links(input: &str) -> String {
    const LABEL: &str = "Skip to Content";
    const TARGET_OPEN: &str = "](#";

    let mut cleaned = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(bracket) = rest.find('[') {
        let (plain, candidate) = rest.split_at(bracket);
        cleaned.push_str(plain);

        // Byte length of a complete skip link at the start of `candidate`, if present.
        let link_len = candidate.as_bytes()[1..]
            .get(..LABEL.len())
            .filter(|label| label.eq_ignore_ascii_case(LABEL.as_bytes()))
            // A matching label is pure ASCII, so the slice below starts on a char boundary.
            .and_then(|_| candidate[1 + LABEL.len()..].strip_prefix(TARGET_OPEN))
            .and_then(|target| {
                let close = target.find(')')?;
                Some(candidate.len() - target.len() + close + 1)
            });

        rest = match link_len {
            Some(consumed) => &candidate[consumed..],
            None => {
                // Not a skip link: keep the bracket and resume right after it.
                cleaned.push('[');
                &candidate[1..]
            }
        };
    }

    cleaned.push_str(rest);
    cleaned
}