use crate::tools::fetch::fetch_page;
use crate::types::{
CrawlArgs, CrawlError, CrawlResult, CrawlSummary, CrawledPage, DaedraError, DaedraResult,
VisitPageArgs,
};
use lazy_static::lazy_static;
use reqwest::Client;
use scraper::{Html, Selector};
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Semaphore;
use tracing::{info, warn};
use url::Url;
// User-Agent value attached to the sitemap and anchor-discovery requests and
// configured as the default on the HTTP client built in `crawl_site`.
const USER_AGENT: &str = "Mozilla/5.0 (compatible; daedra-crawl/0.3.0; +https://github.com/dirmacs/daedra)";
// Largest sitemap body (in bytes) that is accepted; bigger bodies are skipped
// with a warning instead of being parsed.
const SITEMAP_MAX_BYTES: usize = 10 * 1024 * 1024;
// Request timeout applied to the HTTP client built in `crawl_site`.
const SITEMAP_TIMEOUT: Duration = Duration::from_secs(15);
// Well-known sitemap paths, joined onto the crawl root and probed in this
// order; the first one that yields at least one URL wins.
const SITEMAP_CANDIDATES: &[&str] = &[
"/sitemap.xml",
"/sitemap_index.xml",
"/sitemap-index.xml",
"/wp-sitemap.xml",
];
// CSS selector matching every `<a>` element that carries an `href` attribute.
// Built once on first use; the `unwrap` is on a fixed selector literal, so a
// failure here would be a programming error rather than a runtime condition.
lazy_static! {
static ref ANCHOR_SELECTOR: Selector = Selector::parse("a[href]").unwrap();
}
/// Downloads one sitemap document and returns its body as text.
///
/// Returns `None` (after logging a warning where useful) when the request
/// fails, the status is not 2xx, the body cannot be read, or the body is
/// larger than `SITEMAP_MAX_BYTES`.
///
/// The size limit is enforced *while* the body is being streamed, so an
/// oversized or endless response is abandoned as soon as the limit is crossed
/// instead of being buffered in full first. The bytes are interpreted as
/// UTF-8 (the encoding the sitemap protocol requires); invalid sequences are
/// replaced rather than treated as an error.
async fn fetch_sitemap_body(client: &Client, url: &Url) -> Option<String> {
    let mut resp = match client
        .get(url.clone())
        .header("User-Agent", USER_AGENT)
        .send()
        .await
    {
        Ok(r) => r,
        Err(e) => {
            warn!("sitemap probe {} failed: {}", url, e);
            return None;
        }
    };
    if !resp.status().is_success() {
        return None;
    }

    // Cheap early exit when the server declares an oversized body up front.
    // The length is unknown for compressed responses, so the streaming check
    // below remains the authoritative one.
    if let Some(declared) = resp.content_length() {
        // usize -> u64 is lossless on every supported target.
        if declared > SITEMAP_MAX_BYTES as u64 {
            warn!(
                "sitemap {} exceeded {} bytes, skipping",
                url, SITEMAP_MAX_BYTES
            );
            return None;
        }
    }

    let mut body: Vec<u8> = Vec::new();
    loop {
        match resp.chunk().await {
            Ok(Some(chunk)) => {
                if body.len() + chunk.len() > SITEMAP_MAX_BYTES {
                    warn!(
                        "sitemap {} exceeded {} bytes, skipping",
                        url, SITEMAP_MAX_BYTES
                    );
                    return None;
                }
                body.extend_from_slice(&chunk);
            }
            Ok(None) => break,
            Err(e) => {
                warn!("sitemap {} body read failed: {}", url, e);
                return None;
            }
        }
    }

    Some(String::from_utf8_lossy(&body).into_owned())
}
async fn probe_sitemap_candidate(client: &Client, root: &Url, path: &str) -> Option<Vec<Url>> {
let url = root.join(path).ok()?;
let body = fetch_sitemap_body(client, &url).await?;
let urls = parse_sitemap(&body);
if urls.is_empty() {
None
} else {
info!("sitemap {} yielded {} URLs", url, urls.len());
Some(urls)
}
}
/// Walks `SITEMAP_CANDIDATES` in order and returns the URLs from the first
/// candidate that produces any.
///
/// Yields `Ok(None)` when no candidate produced URLs. The current
/// implementation never returns `Err`; probe failures are treated as
/// "no sitemap here".
async fn discover_sitemap(client: &Client, root: &Url) -> DaedraResult<Option<Vec<Url>>> {
    for &path in SITEMAP_CANDIDATES {
        match probe_sitemap_candidate(client, root, path).await {
            Some(found) => return Ok(Some(found)),
            None => continue,
        }
    }
    Ok(None)
}
/// Extracts every `<loc>` entry from a sitemap (or sitemap index) document.
///
/// This is a lightweight textual scan rather than a full XML parse: it finds
/// each `<loc>…</loc>` pair, trims the contents, and keeps the ones that parse
/// as absolute URLs. The result preserves document order and contains each
/// URL at most once. Scanning stops at the first `<loc>` that has no closing
/// tag.
///
/// Location text is XML-unescaped before parsing (the sitemap protocol
/// requires `&`, `<`, `>`, `'` and `"` to be entity-escaped), and a
/// `<![CDATA[…]]>` wrapper is unwrapped. Numeric character references are not
/// decoded.
pub fn parse_sitemap(body: &str) -> Vec<Url> {
    const OPEN_TAG: &str = "<loc>";
    const CLOSE_TAG: &str = "</loc>";

    let mut out = Vec::new();
    // Hash-based membership keeps dedup linear; sitemaps may list tens of
    // thousands of URLs.
    let mut seen = std::collections::HashSet::new();
    let mut rest = body;

    while let Some(open) = rest.find(OPEN_TAG) {
        let after_open = &rest[open + OPEN_TAG.len()..];
        let Some(close) = after_open.find(CLOSE_TAG) else {
            break;
        };
        let raw = after_open[..close].trim();
        let loc_text = match raw
            .strip_prefix("<![CDATA[")
            .and_then(|inner| inner.strip_suffix("]]>"))
        {
            // CDATA content is literal text: no entity decoding.
            Some(cdata) => std::borrow::Cow::Borrowed(cdata.trim()),
            None => decode_xml_entities(raw),
        };
        if let Ok(parsed) = Url::parse(&loc_text) {
            if seen.insert(parsed.clone()) {
                out.push(parsed);
            }
        }
        rest = &after_open[close + CLOSE_TAG.len()..];
    }
    out
}

/// Replaces the five predefined XML entities with their literal characters,
/// borrowing the input unchanged when it contains no `&`.
fn decode_xml_entities(text: &str) -> std::borrow::Cow<'_, str> {
    if !text.contains('&') {
        return std::borrow::Cow::Borrowed(text);
    }
    // `&amp;` goes last so that `&amp;lt;` decodes to `&lt;`, not `<`.
    std::borrow::Cow::Owned(
        text.replace("&lt;", "<")
            .replace("&gt;", ">")
            .replace("&quot;", "\"")
            .replace("&apos;", "'")
            .replace("&amp;", "&"),
    )
}
/// Fallback discovery: fetches `root` and collects same-origin links from its
/// `<a href>` elements.
///
/// Each `href` is resolved against `root`; links that fail to resolve or whose
/// origin differs from `root`'s are skipped. URL fragments are stripped before
/// deduplication so `/page#a` and `/page#b` count as one page. At most `cap`
/// URLs are returned, in document order.
///
/// # Errors
///
/// Returns `DaedraError::FetchError` when the request or reading its body
/// fails.
async fn discover_via_anchors(client: &Client, root: &Url, cap: usize) -> DaedraResult<Vec<Url>> {
    let body = client
        .get(root.clone())
        .header("User-Agent", USER_AGENT)
        .send()
        .await
        .map_err(|e| DaedraError::FetchError(format!("anchor discovery GET {} failed: {}", root, e)))?
        .text()
        .await
        .map_err(|e| DaedraError::FetchError(format!("anchor discovery body {} failed: {}", root, e)))?;

    let doc = Html::parse_document(&body);
    let root_origin = root.origin();
    let mut seen = std::collections::HashSet::new();
    let mut found: Vec<Url> = Vec::new();

    for a in doc.select(&ANCHOR_SELECTOR) {
        // Checked before pushing so that `cap == 0` yields an empty list.
        if found.len() >= cap {
            break;
        }
        let Some(href) = a.value().attr("href") else {
            continue;
        };
        let Ok(mut absolute) = root.join(href) else {
            continue;
        };
        if absolute.origin() != root_origin {
            continue;
        }
        // A fragment only addresses a position inside the same document.
        absolute.set_fragment(None);
        if seen.insert(absolute.clone()) {
            found.push(absolute);
        }
    }
    Ok(found)
}
/// Restricts the caller-supplied crawl limits to supported ranges:
/// `max_pages` to `1..=500` and `concurrency` to `1..=16`.
///
/// Returns `(max_pages, concurrency)` after clamping.
fn clamp_crawl_args(max_pages: usize, concurrency: usize) -> (usize, usize) {
    let pages = max_pages.clamp(1, 500);
    let workers = concurrency.clamp(1, 16);
    (pages, workers)
}
/// Orders `urls` in place by ascending path length (in bytes), so the
/// shortest paths come first. The sort is stable: URLs with equally long
/// paths keep their relative order.
fn rank_urls_by_path_length(urls: &mut [Url]) {
    urls.sort_by(|left, right| left.path().len().cmp(&right.path().len()));
}
pub async fn crawl_site(args: CrawlArgs) -> DaedraResult<CrawlResult> {
let root = Url::parse(&args.root_url)
.map_err(|e| DaedraError::InvalidArguments(format!("invalid root_url: {}", e)))?;
let (max_pages, concurrency) = clamp_crawl_args(args.max_pages, args.concurrency);
let client = Client::builder()
.user_agent(USER_AGENT)
.timeout(SITEMAP_TIMEOUT)
.gzip(true)
.brotli(true)
.build()
.map_err(|e| DaedraError::FetchError(format!("http client build: {}", e)))?;
let (mut candidates, sitemap_found) = match discover_sitemap(&client, &root).await? {
Some(urls) => (urls, true),
None => {
let urls = discover_via_anchors(&client, &root, max_pages * 2).await?;
(urls, false)
}
};
rank_urls_by_path_length(&mut candidates);
candidates.truncate(max_pages);
info!(
root = %root,
sitemap_found,
candidates = candidates.len(),
concurrency,
"crawl_site starting"
);
let sem = Arc::new(Semaphore::new(concurrency));
let mut handles = Vec::with_capacity(candidates.len());
for url in candidates {
let sem = Arc::clone(&sem);
let args = VisitPageArgs {
url: url.to_string(),
selector: None,
include_images: false,
};
handles.push(tokio::spawn(async move {
let _permit = sem.acquire_owned().await.ok()?;
let result = fetch_page(&args).await;
Some((args.url, result))
}));
}
let mut pages: Vec<CrawledPage> = Vec::new();
let mut errors: Vec<CrawlError> = Vec::new();
for handle in handles {
match handle.await {
Ok(Some((url, Ok(page)))) => {
let links = page
.links
.unwrap_or_default()
.into_iter()
.map(|l| l.url)
.collect();
pages.push(CrawledPage {
url,
title: page.title,
markdown: page.content,
links,
});
}
Ok(Some((url, Err(e)))) => errors.push(CrawlError {
url,
error: e.to_string(),
}),
Ok(None) | Err(_) => {
}
}
}
Ok(CrawlResult {
root_url: root.to_string(),
sitemap_found,
summary: CrawlSummary {
requested: max_pages,
fetched: pages.len(),
failed: errors.len(),
},
pages,
errors,
})
}
// Unit tests for `parse_sitemap`; they run purely on in-memory XML fixtures
// and perform no network access.
#[cfg(test)]
mod tests {
use super::*;
// A regular `<urlset>` yields its `<loc>` values in document order, ignoring
// sibling elements such as `<lastmod>`.
#[test]
fn parse_sitemap_handles_urlset() {
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/</loc>
<lastmod>2026-01-01</lastmod>
</url>
<url>
<loc>https://example.com/about</loc>
</url>
<url>
<loc>https://example.com/docs/intro</loc>
</url>
</urlset>"#;
let urls = parse_sitemap(xml);
assert_eq!(urls.len(), 3, "expected 3 unique URLs from urlset");
assert_eq!(urls[0].as_str(), "https://example.com/");
assert_eq!(urls[2].path(), "/docs/intro");
}
// A `<sitemapindex>` is scanned the same way: the returned URLs are the
// nested sitemap locations themselves.
#[test]
fn parse_sitemap_handles_sitemapindex() {
let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>https://example.com/sitemap-1.xml</loc></sitemap>
<sitemap><loc>https://example.com/sitemap-2.xml</loc></sitemap>
</sitemapindex>"#;
let urls = parse_sitemap(xml);
assert_eq!(urls.len(), 2, "sitemap index should return its nested loc entries");
assert!(urls[0].path().ends_with("sitemap-1.xml"));
}
// Entries that do not parse as absolute URLs (relative text, blank content)
// are silently dropped.
#[test]
fn parse_sitemap_drops_invalid_urls() {
let xml = r#"<urlset>
<url><loc>not-a-url</loc></url>
<url><loc>https://example.com/ok</loc></url>
<url><loc> </loc></url>
</urlset>"#;
let urls = parse_sitemap(xml);
assert_eq!(urls.len(), 1, "only the one valid URL should survive");
assert_eq!(urls[0].as_str(), "https://example.com/ok");
}
// The same location listed twice appears only once in the output.
#[test]
fn parse_sitemap_deduplicates() {
let xml = r#"<urlset>
<url><loc>https://example.com/a</loc></url>
<url><loc>https://example.com/a</loc></url>
<url><loc>https://example.com/b</loc></url>
</urlset>"#;
let urls = parse_sitemap(xml);
assert_eq!(urls.len(), 2, "duplicates should collapse");
}
// Empty input and a document without any `<loc>` both produce an empty list.
#[test]
fn parse_sitemap_empty_returns_empty_vec() {
assert!(parse_sitemap("").is_empty());
assert!(parse_sitemap("<?xml version=\"1.0\"?><urlset></urlset>").is_empty());
}
}