use std::collections::HashSet;
use std::sync::LazyLock;
use regex::Regex;
use url::Url;
use super::fetch::fetch_static;
/// Lazily compiled pattern that captures the text content of `<loc>` elements.
///
/// The `i` flag makes the tag names case-insensitive. Capture group 1 holds the
/// element text with surrounding whitespace excluded. The text must start with a
/// character that is neither `<` nor whitespace and may not contain `<`, so
/// empty elements and CDATA-wrapped values are not matched.
///
/// The `unwrap` can only fail if the literal pattern itself is invalid.
static LOC: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?is)<loc>\s*([^<\s][^<]*?)\s*</loc>").unwrap());
/// Maximum number of sitemap documents `collect_sitemap_urls` will fetch in a
/// single crawl before it stops and returns what it has gathered so far.
const MAX_SITEMAP_DOCS: usize = 50;
/// Locates a sitemap for the site that hosts `root_url`.
///
/// Candidates are tried in this order and the first hit is returned:
/// 1. `/sitemap.xml` at the site root,
/// 2. the first non-empty `Sitemap:` directive found in `/robots.txt`,
/// 3. a short list of common alternate sitemap paths.
///
/// Candidates from steps 1 and 3 are fetched and accepted only when the body
/// passes [`looks_like_sitemap`]. A URL declared in `robots.txt` is returned
/// verbatim without being fetched or validated.
///
/// Returns `None` when `root_url` cannot be parsed, has no host, or no
/// candidate matches. Fetch errors are treated as "not a sitemap" so that
/// discovery stays best-effort.
pub async fn discover_sitemap_url(root_url: &str, timeout_secs: u64) -> Option<String> {
    const ROBOTS_DIRECTIVE: &str = "sitemap:";
    const ALTERNATES: &[&str] = &[
        "/sitemap_index.xml",
        "/sitemap-index.xml",
        "/wp-sitemap.xml",
        "/sitemap/sitemap.xml",
    ];

    let parsed = Url::parse(root_url).ok()?;
    let host = parsed.host_str()?;
    // Keep an explicit port: without it, a site served from e.g. `:8080`
    // would be probed on the scheme's default port instead. `Url::port`
    // yields `None` for the default port, so ordinary URLs are unaffected.
    let base = match parsed.port() {
        Some(port) => format!("{}://{host}:{port}", parsed.scheme()),
        None => format!("{}://{host}", parsed.scheme()),
    };

    let primary = format!("{base}/sitemap.xml");
    if looks_like_sitemap(
        &fetch_static(&primary, timeout_secs)
            .await
            .unwrap_or_default(),
    ) {
        return Some(primary);
    }

    if let Ok(robots) = fetch_static(&format!("{base}/robots.txt"), timeout_secs).await {
        for line in robots.lines() {
            let line = line.trim();
            // Case-insensitive prefix test on a borrowed slice; `get` returns
            // `None` for lines that are too short, so the slice below is safe.
            let is_directive = line
                .get(..ROBOTS_DIRECTIVE.len())
                .is_some_and(|prefix| prefix.eq_ignore_ascii_case(ROBOTS_DIRECTIVE));
            if !is_directive {
                continue;
            }
            let declared = line[ROBOTS_DIRECTIVE.len()..].trim();
            if !declared.is_empty() {
                return Some(declared.to_string());
            }
        }
    }

    for alt in ALTERNATES {
        let candidate = format!("{base}{alt}");
        if looks_like_sitemap(
            &fetch_static(&candidate, timeout_secs)
                .await
                .unwrap_or_default(),
        ) {
            return Some(candidate);
        }
    }
    None
}
/// Crawls the sitemap at `sitemap_url` and returns the page URLs it lists.
///
/// A document containing `<sitemapindex` is treated as an index and its
/// `<loc>` entries are scheduled as further sitemap documents; any other
/// document contributes its `<loc>` entries as pages. Pages are returned in
/// discovery order with duplicates removed, and each sitemap document is
/// visited at most once.
///
/// At most [`MAX_SITEMAP_DOCS`] fetches are attempted; failed fetches count
/// toward that cap, are logged at debug level, and are otherwise skipped.
pub async fn collect_sitemap_urls(sitemap_url: &str, timeout_secs: u64) -> Vec<String> {
    let mut pages: Vec<String> = Vec::new();
    let mut seen_pages: HashSet<String> = HashSet::new();
    let mut visited: HashSet<String> = HashSet::new();
    // Entries are popped from the back, so nested indexes are walked depth-first.
    let mut pending: Vec<String> = vec![sitemap_url.to_string()];
    let mut fetched = 0usize;

    while let Some(doc_url) = pending.pop() {
        if !visited.insert(doc_url.clone()) {
            continue;
        }
        if fetched >= MAX_SITEMAP_DOCS {
            tracing::debug!("web_scrape: sitemap crawl hit {MAX_SITEMAP_DOCS}-doc cap, stopping");
            break;
        }
        fetched += 1;

        let body = match fetch_static(&doc_url, timeout_secs).await {
            Ok(b) => b,
            Err(e) => {
                tracing::debug!("web_scrape: sitemap {doc_url} fetch failed: {e}");
                continue;
            }
        };

        // Relative `<loc>` values are resolved against the document that listed them.
        let base = Url::parse(&doc_url).ok();
        let locs = extract_locs(&body, base.as_ref());
        if body.contains("<sitemapindex") {
            pending.extend(locs);
        } else {
            for page in locs {
                if seen_pages.insert(page.clone()) {
                    pages.push(page);
                }
            }
        }
    }
    pages
}
/// Reports whether `body` looks like a sitemap document.
///
/// This is a case-sensitive substring check for the opening of a `<urlset` or
/// `<sitemapindex` element; the body is not parsed as XML.
pub fn looks_like_sitemap(body: &str) -> bool {
    const ROOT_MARKERS: [&str; 2] = ["<urlset", "<sitemapindex"];
    ROOT_MARKERS.iter().any(|&marker| body.contains(marker))
}
/// Extracts every `<loc>` value from `xml` as an absolute URL string.
///
/// Each value is trimmed and entity-decoded. Values that already start with
/// `http://` or `https://` are kept as they are; anything else is resolved
/// against `base`, and dropped when `base` is `None` or the join fails.
/// Document order is preserved and duplicates are not removed.
pub fn extract_locs(xml: &str, base: Option<&Url>) -> Vec<String> {
    let mut resolved = Vec::new();
    for caps in LOC.captures_iter(xml) {
        let Some(raw) = caps.get(1) else {
            continue;
        };
        let target = decode_xml_entities(raw.as_str().trim());
        if target.starts_with("http://") || target.starts_with("https://") {
            resolved.push(target);
        } else if let Some(joined) = base.and_then(|b| b.join(&target).ok()) {
            resolved.push(joined.to_string());
        }
    }
    resolved
}
/// Decodes the five predefined XML entities (`amp`, `lt`, `gt`, `quot`,
/// `apos`) found in `s`.
///
/// The input is scanned once, left to right, and text produced by a
/// replacement is never scanned again. An escaped ampersand followed by `lt;`
/// therefore yields the literal text of an `lt` reference, not `<`.
///
/// Unknown or unterminated references are copied through unchanged. Numeric
/// character references are not decoded.
fn decode_xml_entities(s: &str) -> String {
    // Longest supported reference: '&' + a four-letter name + ';'.
    const MAX_REFERENCE_LEN: usize = 6;

    let mut decoded = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        // Look for the terminating ';' only within the window a supported
        // reference can occupy, so stray ampersands stay cheap to skip.
        let replacement = tail
            .bytes()
            .take(MAX_REFERENCE_LEN)
            .position(|b| b == b';')
            .and_then(|semi| {
                let ch = match &tail[1..semi] {
                    "amp" => '&',
                    "lt" => '<',
                    "gt" => '>',
                    "quot" => '"',
                    "apos" => '\'',
                    _ => return None,
                };
                Some((ch, semi))
            });
        match replacement {
            Some((ch, semi)) => {
                decoded.push(ch);
                rest = &tail[semi + 1..];
            }
            None => {
                // Not a reference we know: keep the ampersand and move on.
                decoded.push('&');
                rest = &tail[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}