use crate::selectors::{JSONLD_SELECTOR, LINK_SELECTOR};
use crate::tools::clean::utils::canonicalize_domain;
use scraper::{ElementRef, Html, Selector};
use serde_json::Value;
use url::Url;
/// Minimum number of children a parent must have, and minimum number of
/// members a pattern group must have, before a sibling group is recorded.
const MIN_SIBLING_GROUP_SIZE: usize = 3;
/// Minimum number of leading child tags two siblings must share to be placed
/// in the same single-element group.
const MIN_COMMON_PREFIX_LEN: usize = 2;
/// `pattern_len` recorded for groups whose repeating unit is one element.
const SINGLE_ELEMENT_PATTERN_LEN: usize = 1;
/// Smallest number of consecutive elements tried as a multi-element pattern.
const MIN_PATTERN_LEN: usize = 2;
/// Multi-element pattern lengths are tried up to `children.len() / MAX_PATTERN_RATIO`.
const MAX_PATTERN_RATIO: usize = 2;
/// Ancestor tag that sets `SiblingGroup::in_main`.
const MAIN_TAG: &str = "main";
/// Child tags that are skipped when collecting siblings.
const JUNK_TAGS: &[&str] = &["script", "style", "iframe", "noscript"];
/// Ancestor tags that set `SiblingGroup::in_navigation`.
const NAV_TAGS: &[&str] = &["nav", "footer", "aside", "header"];
/// Structural fingerprint of an element: the tag names of its direct element
/// children, in the order `children()` yields them.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct StructurePattern {
    /// Tag names of the direct element children (text and comment nodes are not included).
    pub tags: Vec<String>,
}
/// A set of structurally similar repetitions found among the children of one parent.
#[derive(Debug)]
struct SiblingGroup {
    /// Whether the first member of the group has a `main` ancestor.
    in_main: bool,
    /// Whether the first member of the group has a navigation-like ancestor.
    in_navigation: bool,
    /// Number of consecutive elements that make up one repetition.
    pattern_len: usize,
    /// Serialized HTML of each repetition.
    siblings: Vec<String>,
}

impl SiblingGroup {
    /// Number of elements spanned by the group: repetitions times elements per repetition.
    fn coverage(&self) -> usize {
        self.quantity() * self.pattern_len
    }

    /// Number of repetitions in the group.
    fn quantity(&self) -> usize {
        let Self { siblings, .. } = self;
        siblings.len()
    }
}
/// Test-only convenience: parses `html`, picks the best sibling group and
/// returns one link per sibling, resolved against `url`.
#[cfg(test)]
pub(super) fn map_siblings(html: &str, url: &str) -> Vec<String> {
    map_sibling_link(&map_body_siblings(html), url)
}
/// Picks the best sibling group in an already parsed document and returns one
/// link per sibling, resolved against `url`.
pub(super) fn map_siblings_from_doc(doc: &Html, url: &str) -> Vec<String> {
    map_sibling_link(&map_body_siblings_from_doc(doc), url)
}
/// Test-only convenience: parses `html`, collects its JSON-LD item lists and
/// returns the links they reference, resolved against `url`.
#[cfg(test)]
pub(super) fn map_itemlist(html: &str, url: &str) -> Vec<String> {
    let doc = Html::parse_document(html);
    map_itemlist_link(&map_jsonld_itemlist_from_doc(&doc), &doc, url)
}
/// Collects the JSON-LD item lists of an already parsed document and returns
/// the links they reference, resolved against `url`.
pub(super) fn map_itemlist_from_doc(doc: &Html, url: &str) -> Vec<String> {
    map_itemlist_link(&map_jsonld_itemlist_from_doc(doc), doc, url)
}
/// Test-only convenience: parses `html` as a document and returns the HTML of
/// each member of the best sibling group.
#[cfg(test)]
pub(super) fn map_body_siblings(html: &str) -> Vec<String> {
    map_body_siblings_from_doc(&Html::parse_document(html))
}
/// Returns the serialized HTML of every member of the best-ranked sibling
/// group in `doc`, or an empty vector when no group was found.
pub(super) fn map_body_siblings_from_doc(doc: &Html) -> Vec<String> {
    let mut groups: Vec<SiblingGroup> = Vec::new();
    let root = doc.root_element();
    map_sibling_groups_recursive(&root, &mut groups);

    // Ranking key, most significant component first: outside navigation,
    // inside `main`, coverage, number of repetitions, pattern length.
    // On equal keys `max_by_key` keeps the last candidate.
    let rank = |group: &SiblingGroup| {
        (
            !group.in_navigation,
            group.in_main,
            group.coverage(),
            group.quantity(),
            group.pattern_len,
        )
    };

    match groups.into_iter().max_by_key(rank) {
        Some(best) => best.siblings,
        None => Vec::new(),
    }
}
/// Strips escaping artefacts from a raw `href` attribute value.
///
/// Removes every backslash and every HTML-entity spelling of a quote
/// character, then trims surrounding whitespace and surrounding literal
/// `"` / `'` characters. Quote characters in the middle of the value are kept.
///
/// NOTE(review): the entity list is reconstructed. The literals in this
/// function had been entity-decoded into bare quote characters (one of them
/// no longer a valid string literal); the trailing `trim_matches` calls show
/// that bare quotes were only meant to be trimmed from the ends. Confirm the
/// exact entity spellings against version control.
fn clean_href(href: &str) -> String {
    const QUOTE_ENTITIES: [&str; 4] = ["&quot;", "&#34;", "&apos;", "&#39;"];

    let mut cleaned = href.replace('\\', "");
    for entity in QUOTE_ENTITIES.iter() {
        // Skip the reallocation when the entity does not occur at all.
        if cleaned.contains(*entity) {
            cleaned = cleaned.replace(*entity, "");
        }
    }

    cleaned
        .trim()
        .trim_matches('"')
        .trim_matches('\'')
        .trim()
        .to_string()
}
/// Returns `true` when the URL scheme is `http` or `https`.
fn is_valid_scheme(url: &Url) -> bool {
    ["http", "https"].contains(&url.scheme())
}
/// Returns `true` when any ancestor of `element` is an element named
/// `tag_name`. The element itself is not considered.
fn is_inside_tag(element: &ElementRef, tag_name: &str) -> bool {
    element
        .ancestors()
        .filter_map(ElementRef::wrap)
        .any(|ancestor| ancestor.value().name() == tag_name)
}
/// Builds the structural fingerprint of `element`: the tag names of its
/// direct element children, in order.
fn map_structure_pattern(element: &ElementRef) -> StructurePattern {
    let mut tags = Vec::new();
    for node in element.children() {
        // Only element nodes contribute to the fingerprint.
        if let Some(child) = ElementRef::wrap(node) {
            tags.push(child.value().name().to_string());
        }
    }
    StructurePattern { tags }
}
/// Walks the tree below `element` and appends every sibling group it finds to
/// `all_groups`.
///
/// At each level the non-junk element children are bucketed by the tag names
/// of their own children: two children share a bucket when the shorter of
/// their tag lists has at least `MIN_COMMON_PREFIX_LEN` entries and is a
/// prefix of the longer one. Buckets with at least `MIN_SIBLING_GROUP_SIZE`
/// members become single-element groups; multi-element patterns are then
/// searched on the same children before recursing into each child.
fn map_sibling_groups_recursive<'a>(
    element: &'a ElementRef<'a>,
    all_groups: &mut Vec<SiblingGroup>,
) {
    let mut children = Vec::new();
    for node in element.children() {
        if let Some(child) = ElementRef::wrap(node) {
            if !JUNK_TAGS.contains(&child.value().name()) {
                children.push(child);
            }
        }
    }

    if children.len() >= MIN_SIBLING_GROUP_SIZE {
        // Each bucket: (shortest tag list seen so far, indices into `children`).
        let mut buckets: Vec<(Vec<String>, Vec<usize>)> = Vec::new();

        for (position, child) in children.iter().enumerate() {
            let child_tags = map_structure_pattern(child).tags;

            // First bucket whose tag list shares a long enough common prefix.
            let bucket = buckets.iter_mut().find(|(bucket_tags, _)| {
                let shared = bucket_tags.len().min(child_tags.len());
                shared >= MIN_COMMON_PREFIX_LEN && bucket_tags[..shared] == child_tags[..shared]
            });

            match bucket {
                Some((bucket_tags, members)) => {
                    members.push(position);
                    // Keep the shortest tag list as the bucket's representative.
                    if child_tags.len() < bucket_tags.len() {
                        *bucket_tags = child_tags;
                    }
                }
                None => buckets.push((child_tags, vec![position])),
            }
        }

        for (tags, members) in buckets {
            if members.len() < MIN_SIBLING_GROUP_SIZE || tags.is_empty() {
                continue;
            }
            let first_child = &children[members[0]];
            let siblings: Vec<String> = members.iter().map(|&i| children[i].html()).collect();
            all_groups.push(SiblingGroup {
                in_main: is_inside_tag(first_child, MAIN_TAG),
                in_navigation: NAV_TAGS.iter().any(|tag| is_inside_tag(first_child, tag)),
                pattern_len: SINGLE_ELEMENT_PATTERN_LEN,
                siblings,
            });
        }

        map_multi_element_patterns(&children, all_groups);
    }

    for child in &children {
        map_sibling_groups_recursive(child, all_groups);
    }
}
/// Finds repeating runs of `pattern_len` consecutive children (for every
/// `pattern_len` from `MIN_PATTERN_LEN` up to `n / MAX_PATTERN_RATIO`) and
/// appends one `SiblingGroup` per repeated pattern to `all_groups`.
///
/// A pattern is recorded only when it is not made of identical elements
/// (those are handled as single-element groups by the caller) and when at
/// least `MIN_SIBLING_GROUP_SIZE` non-overlapping occurrences exist.
///
/// Groups are pushed in order of first occurrence so that the output is
/// deterministic; `HashMap` iteration order is unspecified and would
/// otherwise leak into tie-breaking done by the caller.
fn map_multi_element_patterns(children: &[ElementRef], all_groups: &mut Vec<SiblingGroup>) {
    use std::collections::HashMap;

    let n = children.len();

    // Every window that covers a child needs that child's fingerprint, so
    // compute each fingerprint once up front.
    let child_patterns: Vec<StructurePattern> = children
        .iter()
        .map(|child| map_structure_pattern(child))
        .collect();

    for pattern_len in MIN_PATTERN_LEN..=(n / MAX_PATTERN_RATIO) {
        // Longer patterns cannot repeat often enough either.
        if n < pattern_len * MIN_SIBLING_GROUP_SIZE {
            break;
        }

        // Start indices of every window, keyed by the window's fingerprints.
        // Indices are pushed in ascending order.
        let mut starts_by_pattern: HashMap<&[StructurePattern], Vec<usize>> = HashMap::new();
        for (start, window) in child_patterns.windows(pattern_len).enumerate() {
            starts_by_pattern.entry(window).or_default().push(start);
        }

        let mut candidates: Vec<(&[StructurePattern], Vec<usize>)> = starts_by_pattern
            .into_iter()
            .filter(|(_, starts)| starts.len() >= MIN_SIBLING_GROUP_SIZE)
            .collect();
        // A start index belongs to exactly one pattern, so this key is unique.
        candidates.sort_by_key(|(_, starts)| starts[0]);

        for (pattern, starts) in candidates {
            // Homogeneous patterns are already covered by single-element groups.
            let first = &pattern[0];
            if pattern.iter().all(|p| p == first) {
                continue;
            }

            // Greedy left-to-right selection. Because `starts` is ascending,
            // a window overlaps an earlier pick only if it overlaps the last one.
            let mut non_overlapping: Vec<usize> = Vec::new();
            for &start in &starts {
                let fits = non_overlapping
                    .last()
                    .map_or(true, |&previous| start >= previous + pattern_len);
                if fits {
                    non_overlapping.push(start);
                }
            }
            if non_overlapping.len() < MIN_SIBLING_GROUP_SIZE {
                continue;
            }

            // One sibling entry per occurrence: the concatenated HTML of its elements.
            let siblings: Vec<String> = non_overlapping
                .iter()
                .map(|&start| {
                    children[start..start + pattern_len]
                        .iter()
                        .map(|child| child.html())
                        .collect::<String>()
                })
                .collect();

            let first_child = &children[non_overlapping[0]];
            all_groups.push(SiblingGroup {
                in_main: is_inside_tag(first_child, MAIN_TAG),
                in_navigation: NAV_TAGS.iter().any(|tag| is_inside_tag(first_child, tag)),
                pattern_len,
                siblings,
            });
        }
    }
}
/// Extracts the primary link of each sibling HTML fragment, resolved against
/// `url`.
///
/// Siblings without a usable link are skipped. When `url` cannot be parsed a
/// warning is written to stderr and an empty vector is returned.
pub(super) fn map_sibling_link(siblings: &[String], url: &str) -> Vec<String> {
    let mut links = Vec::new();
    match Url::parse(url) {
        Err(e) => eprintln!("Warning: Invalid URL '{}': {}", url, e),
        Ok(base) => {
            for sibling_html in siblings {
                let fragment = Html::parse_fragment(sibling_html);
                if let Some(link) = select_primary_link_in_document(&fragment, &base) {
                    links.push(link);
                }
            }
        }
    }
    links
}
/// Collects every JSON-LD `ItemList` object found in the document's JSON-LD
/// script elements. Scripts whose content is not valid JSON are ignored.
pub(super) fn map_jsonld_itemlist_from_doc(doc: &Html) -> Vec<Value> {
    doc.select(&JSONLD_SELECTOR)
        .filter_map(|script| serde_json::from_str::<Value>(&script.inner_html()).ok())
        .fold(Vec::new(), |mut found, value| {
            collect_itemlists(&value, &mut found);
            found
        })
}
/// Recursively collects JSON-LD objects typed as `ItemList` into `out`.
///
/// Arrays are searched element by element. For objects, the object itself is
/// pushed (cloned) when its `@type` names `ItemList` (ASCII case-insensitive),
/// and its `@graph` and `mainEntity` members are searched as well.
///
/// `@type` may be a single string or an array of strings; an array matches
/// when any of its string entries is `ItemList`.
fn collect_itemlists(value: &Value, out: &mut Vec<Value>) {
    match value {
        Value::Array(items) => {
            for item in items {
                collect_itemlists(item, out);
            }
        }
        Value::Object(obj) => {
            let names_item_list = |candidate: &Value| {
                candidate
                    .as_str()
                    .map_or(false, |name| name.eq_ignore_ascii_case("ItemList"))
            };
            let is_item_list = match obj.get("@type") {
                Some(Value::Array(types)) => types.iter().any(|t| names_item_list(t)),
                Some(single) => names_item_list(single),
                None => false,
            };
            if is_item_list {
                out.push(value.clone());
            }

            if let Some(graph) = obj.get("@graph") {
                collect_itemlists(graph, out);
            }
            if let Some(main_entity) = obj.get("mainEntity") {
                collect_itemlists(main_entity, out);
            }
        }
        _ => {}
    }
}
pub(super) fn map_itemlist_link(itemlist: &[Value], doc: &Html, url: &str) -> Vec<String> {
let base = match Url::parse(url) {
Ok(u) => u,
Err(e) => {
eprintln!("Warning: Invalid URL '{}': {}", url, e);
return Vec::new();
}
};
itemlist
.iter()
.filter_map(|item| {
let elements = item.get("itemListElement")?.as_array()?;
Some(
elements
.iter()
.filter_map(|elem| {
let url_str = elem.get("url")?.as_str()?;
if let Some(anchor_id) = url_str.strip_prefix('#') {
if let Some(resolved) = map_anchor_to_link(anchor_id, doc, &base) {
return Some(resolved);
}
return None;
}
if let Ok(url) = Url::parse(url_str) {
if is_valid_scheme(&url) {
if let Some(fragment) = url.fragment() {
let hosts_match = match (url.host_str(), base.host_str()) {
(Some(url_host), Some(base_host)) => {
canonicalize_domain(url_host)
== canonicalize_domain(base_host)
}
_ => false,
};
if url.scheme() == base.scheme() && hosts_match {
if let Some(resolved) =
map_anchor_to_link(fragment, doc, &base)
{
return Some(resolved);
}
return None;
}
}
return Some(url.to_string());
}
}
base.join(url_str)
.ok()
.filter(is_valid_scheme)
.map(|u| u.to_string())
})
.collect::<Vec<String>>(),
)
})
.flatten()
.collect()
}
/// Finds the first element in `doc` whose `id` attribute equals `anchor_id`
/// and returns the primary link inside it, resolved against `base`.
///
/// Returns `None` when the selector cannot be built, no element has that id,
/// or the element contains no usable link.
fn map_anchor_to_link(anchor_id: &str, doc: &Html, base: &Url) -> Option<String> {
    // The id is embedded in a single-quoted CSS string, so characters that
    // would end or corrupt that string have to be escaped first.
    let mut escaped = String::with_capacity(anchor_id.len());
    for ch in anchor_id.chars() {
        match ch {
            '\\' | '\'' => {
                escaped.push('\\');
                escaped.push(ch);
            }
            // Raw newlines are not allowed inside a CSS string; use a hex
            // escape (the trailing space terminates the escape sequence).
            '\n' | '\r' | '\x0C' => escaped.push_str(&format!("\\{:x} ", ch as u32)),
            _ => escaped.push(ch),
        }
    }

    let selector = Selector::parse(&format!("[id='{}']", escaped)).ok()?;
    let element = doc.select(&selector).next()?;
    select_primary_link_in_element(&element, base)
}
/// Returns `true` when `text` contains at least one non-whitespace character.
fn has_meaningful_text(text: &str) -> bool {
    text.chars().any(|ch| !ch.is_whitespace())
}
/// Decides whether `link` looks like a title link.
///
/// Requires non-blank `text`. The link qualifies when it is itself an
/// `h1`-`h4` element, has an `h1`-`h4` ancestor, has an `h1`-`h4`, `strong`
/// or `b` element among the nodes yielded by `descendants()`, or has a
/// `strong` or `b` ancestor.
fn is_heading_link(link: &ElementRef, text: &str) -> bool {
    const HEADING_TAGS: [&str; 4] = ["h1", "h2", "h3", "h4"];
    const EMPHASIS_TAGS: [&str; 2] = ["strong", "b"];

    if !has_meaningful_text(text) {
        return false;
    }

    let is_heading_itself = HEADING_TAGS.contains(&link.value().name());
    let has_heading_ancestor = || HEADING_TAGS.iter().any(|tag| is_inside_tag(link, tag));
    let wraps_heading_or_emphasis = || {
        link.descendants()
            .filter_map(ElementRef::wrap)
            .any(|inner| {
                let name = inner.value().name();
                HEADING_TAGS.contains(&name) || EMPHASIS_TAGS.contains(&name)
            })
    };
    let has_emphasis_ancestor = || EMPHASIS_TAGS.iter().any(|tag| is_inside_tag(link, tag));

    is_heading_itself
        || has_heading_ancestor()
        || wraps_heading_or_emphasis()
        || has_emphasis_ancestor()
}
/// Returns `true` when the trimmed text is, ignoring ASCII case, exactly one
/// of a fixed set of action labels such as "share" or "print".
fn is_utility_text(text: &str) -> bool {
    const UTILITY_LABELS: [&str; 13] = [
        "share",
        "print",
        "save",
        "pin",
        "email",
        "tweet",
        "facebook",
        "pinterest",
        "linkedin",
        "reddit",
        "copy link",
        "comment",
        "buy",
    ];

    let label = text.trim();
    UTILITY_LABELS
        .iter()
        .any(|known| label.eq_ignore_ascii_case(known))
}
/// Collapses every run of whitespace into a single space, drops leading and
/// trailing whitespace, and lowercases ASCII letters.
fn normalize_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first word.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized.make_ascii_lowercase();
    normalized
}
/// Returns the normalized, non-empty text of every `h1`-`h4` element among
/// the nodes yielded by `element.descendants()`, in traversal order.
fn collect_heading_texts(element: &ElementRef) -> Vec<String> {
    let mut texts = Vec::new();
    for node in element.descendants() {
        let heading = match ElementRef::wrap(node) {
            Some(el) if matches!(el.value().name(), "h1" | "h2" | "h3" | "h4") => el,
            _ => continue,
        };
        let text = normalize_text(&heading.text().collect::<String>());
        if !text.is_empty() {
            texts.push(text);
        }
    }
    texts
}
/// Returns `true` when the (non-empty) link text and some non-empty heading
/// are equal or one contains the other.
fn link_matches_heading(link_text_norm: &str, headings: &[String]) -> bool {
    if link_text_norm.is_empty() {
        return false;
    }
    headings
        .iter()
        .filter(|heading| !heading.is_empty())
        // Equality is the special case where each string contains the other.
        .any(|heading| {
            link_text_norm.contains(heading.as_str()) || heading.contains(link_text_norm)
        })
}
/// Picks the most title-like link inside `element`, resolved against `base`.
///
/// Only links whose cleaned `href` resolves to an http(s) URL are considered.
/// Preference order:
/// 1. A heading link (see `is_heading_link` / `link_matches_heading`). With
///    several of them: the first whose text equals a heading, then the first
///    whose text contains a heading, then the first whose text is contained
///    in a heading, otherwise the last heading link.
/// 2. The first link with non-blank text that is not a utility label.
/// 3. The first valid link.
fn select_primary_link_in_element(element: &ElementRef, base: &Url) -> Option<String> {
    let headings = collect_heading_texts(element);
    let mut first_valid: Option<String> = None;
    let mut first_meaningful: Option<String> = None;
    // (resolved URL, normalized link text) for every heading link.
    let mut heading_candidates: Vec<(String, String)> = Vec::new();

    for anchor in element.select(&LINK_SELECTOR) {
        let resolved = anchor.value().attr("href").and_then(|raw| {
            let href = clean_href(raw);
            if href.starts_with("//") {
                // Protocol-relative: borrow the scheme of the base URL.
                Url::parse(&format!("{}:{}", base.scheme(), href)).ok()
            } else {
                Url::parse(&href).ok().or_else(|| base.join(&href).ok())
            }
        });
        let target = match resolved {
            Some(parsed) if is_valid_scheme(&parsed) => parsed.to_string(),
            _ => continue,
        };

        if first_valid.is_none() {
            first_valid = Some(target.clone());
        }

        let raw_text: String = anchor.text().collect();
        let normalized = normalize_text(&raw_text);

        if first_meaningful.is_none()
            && has_meaningful_text(&raw_text)
            && !is_utility_text(&raw_text)
        {
            first_meaningful = Some(target.clone());
        }

        if is_heading_link(&anchor, &raw_text) || link_matches_heading(&normalized, &headings) {
            heading_candidates.push((target, normalized));
        }
    }

    let heading_choice = if heading_candidates.len() <= 1 {
        heading_candidates.pop().map(|(href, _)| href)
    } else {
        // First candidate (in document order) whose text satisfies `rule`
        // against any heading.
        let first_by = |rule: fn(&str, &str) -> bool| -> Option<String> {
            heading_candidates
                .iter()
                .find(|(_, text)| headings.iter().any(|heading| rule(text, heading)))
                .map(|(href, _)| href.clone())
        };

        first_by(|text, heading| text == heading)
            .or_else(|| first_by(|text, heading| !heading.is_empty() && text.contains(heading)))
            .or_else(|| first_by(|text, heading| !text.is_empty() && heading.contains(text)))
            .or_else(|| heading_candidates.last().map(|(href, _)| href.clone()))
    };

    heading_choice.or(first_meaningful).or(first_valid)
}
/// Returns the primary link of the first element node in `doc` (in the order
/// `doc.tree.nodes()` yields them) that has one, or `None` if no element does.
fn select_primary_link_in_document(doc: &Html, base: &Url) -> Option<String> {
    doc.tree
        .nodes()
        .filter_map(ElementRef::wrap)
        .find_map(|element| select_primary_link_in_element(&element, base))
}