1use std::collections::{HashSet, VecDeque};
4use std::io::Read as _;
5use std::time::{Duration, Instant};
6
7use tokio::task::spawn_blocking;
8use url::Url;
9
10use crate::robots::{RobotsPolicy, RobotsRules};
11use crate::scope::{is_same_site, matches_scope, normalize_url};
12use crate::{bridge, net};
13
/// Byte cap applied when reading a sitemap response body (50 MiB).
const MAP_SITEMAP_MAX_BYTES: u64 = 50 * 1024 * 1024;
/// Byte cap applied to the output of the gzip decoder for compressed sitemaps (10 MiB).
const MAP_SITEMAP_MAX_DECOMPRESSED: u64 = 10 * 1024 * 1024;
/// Largest accepted decompressed-to-compressed size ratio for a gzip sitemap.
const MAP_GZIP_MAX_RATIO: u64 = 100;
/// Byte cap applied when reading the seed page for the HTML link fallback (2 MiB).
const MAP_HTML_MAX_BYTES: u64 = 2 * 1024 * 1024;
/// Bound on the request loop in `fetch_following_redirects`.
const MAP_MAX_REDIRECTS: u8 = 5;
/// Maximum number of sitemap documents fetched during one run.
const MAP_MAX_SITEMAPS: usize = 200;
/// Deepest sitemap-index nesting level that is still fetched.
const MAP_MAX_INDEX_DEPTH: u8 = 5;
/// Minimum spacing that `throttle` enforces between consecutive fetches.
const MAP_MIN_FETCH_INTERVAL: Duration = Duration::from_millis(500);
/// Longest candidate URL string, in bytes, that `validate_entry` accepts.
const MAP_URL_MAX_LEN: usize = 2048;
/// Number of leading bytes that `looks_like_html` inspects.
const HTML_SNIFF_LEN: usize = 100;
24
/// Builder-style options for a `map` / `map_blocking` run.
///
/// Start from `MapOptions::new` and chain the setters; every setter consumes
/// and returns the value so calls can be strung together.
#[must_use = "options do nothing until passed to map()"]
#[derive(Debug, Clone)]
pub struct MapOptions {
    /// Seed URL exactly as supplied by the caller (validated later by `map`).
    url: String,
    /// Maximum number of URLs to report.
    limit: usize,
    /// Glob patterns a URL must match; empty means "no include filter".
    include: Vec<String>,
    /// Glob patterns that exclude a URL; empty means "no exclude filter".
    exclude: Vec<String>,
    /// Optional User-Agent override.
    user_agent: Option<String>,
    /// Request timeout in whole seconds.
    timeout: u64,
    /// When `true`, the HTML link fallback is skipped.
    no_fallback: bool,
}

impl MapOptions {
    /// Creates options for `url` with the defaults: limit 5000, no filters,
    /// no User-Agent override, 30 second timeout, fallback enabled.
    pub fn new(url: impl Into<String>) -> Self {
        let url = url.into();
        Self {
            url,
            limit: 5000,
            include: Vec::new(),
            exclude: Vec::new(),
            user_agent: None,
            timeout: 30,
            no_fallback: false,
        }
    }

    /// Sets the maximum number of URLs to report.
    pub fn limit(self, n: usize) -> Self {
        Self { limit: n, ..self }
    }

    /// Replaces the include patterns with owned copies of `patterns`.
    pub fn include(self, patterns: &[&str]) -> Self {
        let include = patterns.iter().copied().map(String::from).collect();
        Self { include, ..self }
    }

    /// Replaces the exclude patterns with owned copies of `patterns`.
    pub fn exclude(self, patterns: &[&str]) -> Self {
        let exclude = patterns.iter().copied().map(String::from).collect();
        Self { exclude, ..self }
    }

    /// Overrides the User-Agent.
    pub fn user_agent(self, ua: impl Into<String>) -> Self {
        Self {
            user_agent: Some(ua.into()),
            ..self
        }
    }

    /// Sets the request timeout in seconds.
    pub fn timeout(self, secs: u64) -> Self {
        Self {
            timeout: secs,
            ..self
        }
    }

    /// Enables (`true`) or disables (`false`) skipping of the HTML fallback.
    pub fn no_fallback(self, yes: bool) -> Self {
        Self {
            no_fallback: yes,
            ..self
        }
    }
}
88
/// One URL reported by `map` / `map_blocking`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct MappedUrl {
    /// The URL string, as produced by the mapping run.
    pub url: String,
    /// The `lastmod` text that accompanied the URL, if any; left out of the
    /// serialized output when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
98
99pub fn map_blocking(opts: &MapOptions) -> crate::error::Result<Vec<MappedUrl>> {
101 crate::runtime::block_on(map(opts)).map_err(|e| crate::error::Error::engine(e, None))?
102}
103
104pub async fn map(opts: &MapOptions) -> crate::error::Result<Vec<MappedUrl>> {
106 net::ensure_crypto_provider();
107 let seed = net::validate_url(&opts.url)?;
108
109 let include = if opts.include.is_empty() {
110 None
111 } else {
112 Some(crate::scope::build_globset(&opts.include)?)
113 };
114 let exclude = if opts.exclude.is_empty() {
115 None
116 } else {
117 Some(crate::scope::build_globset(&opts.exclude)?)
118 };
119
120 let internal = MapConfig {
121 seed,
122 limit: opts.limit,
123 include,
124 exclude,
125 user_agent: opts.user_agent.clone(),
126 timeout: Duration::from_secs(opts.timeout),
127 no_fallback: opts.no_fallback,
128 };
129
130 let mut results = Vec::new();
131 run(&internal, |entry| {
132 results.push(MappedUrl {
133 url: entry.url.clone(),
134 lastmod: entry.lastmod.clone(),
135 });
136 })
137 .await;
138 Ok(results)
139}
140
/// Resolved configuration consumed by `run`.
pub(crate) struct MapConfig {
    /// Parsed seed URL; sitemap and entry URLs are compared against it.
    pub seed: Url,
    /// Maximum number of URLs to report.
    pub limit: usize,
    /// Compiled include patterns, or `None` for no include filter.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude patterns, or `None` for no exclude filter.
    pub exclude: Option<globset::GlobSet>,
    /// Optional User-Agent override.
    pub user_agent: Option<String>,
    /// Timeout handed to the HTTP agent and to the robots fetch.
    pub timeout: Duration,
    /// When `true`, the HTML link fallback is skipped.
    pub no_fallback: bool,
}
151
/// One accepted URL, handed to the `on_url` callback of `run`.
#[derive(serde::Serialize)]
pub(crate) struct MapEntry {
    /// Normalized URL string.
    pub url: String,
    /// `lastmod` text from the sitemap, if any; left out of the serialized
    /// output when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
159
160pub(crate) async fn run(opts: &MapConfig, mut on_url: impl FnMut(&MapEntry)) {
162 let ua = opts
163 .user_agent
164 .as_deref()
165 .unwrap_or_else(|| bridge::default_user_agent());
166 let agent = build_agent(ua, opts.timeout);
167
168 let robots = {
169 let seed = opts.seed.clone();
170 let user_agent = opts.user_agent.clone();
171 let timeout = opts.timeout;
172 spawn_blocking(move || RobotsRules::fetch(&seed, user_agent.as_deref(), timeout))
173 .await
174 .unwrap_or(RobotsPolicy::Unreachable)
175 };
176
177 let mut visited = HashSet::new();
178 let mut count = 0;
179 let mut last_fetch = Instant::now()
180 .checked_sub(MAP_MIN_FETCH_INTERVAL)
181 .unwrap_or_else(Instant::now);
182 let mut sitemap_queue: VecDeque<(Url, u8)> = discover_sitemaps(&robots, &opts.seed)
183 .into_iter()
184 .map(|u| (u, 0))
185 .collect();
186 let mut sitemaps_fetched = 0;
187
188 while let Some((sitemap_url, depth)) = sitemap_queue.pop_front() {
189 if sitemaps_fetched >= MAP_MAX_SITEMAPS || count >= opts.limit {
190 break;
191 }
192 if depth > MAP_MAX_INDEX_DEPTH || !is_same_site(&opts.seed, &sitemap_url) {
193 continue;
194 }
195
196 throttle(&mut last_fetch).await;
197 sitemaps_fetched += 1;
198
199 let body = {
200 let agent = agent.clone();
201 spawn_blocking({
202 let seed = opts.seed.clone();
203 move || fetch_sitemap(&agent, &sitemap_url, &seed)
204 })
205 .await
206 .ok()
207 .flatten()
208 };
209 let Some(body) = body else { continue };
210
211 for entry in parse_sitemap(&body) {
212 match entry {
213 SitemapEntry::Url { loc, lastmod } => {
214 if count >= opts.limit {
215 break;
216 }
217 if let Some(e) = validate_entry(&loc, lastmod, &opts.seed, &robots, opts, &mut visited) {
218 on_url(&e);
219 count += 1;
220 }
221 }
222 SitemapEntry::Sitemap { loc } => {
223 if let Ok(url) = Url::parse(&loc) {
224 sitemap_queue.push_back((url, depth + 1));
225 }
226 }
227 }
228 }
229 }
230
231 if count == 0 && !opts.no_fallback {
232 throttle(&mut last_fetch).await;
233 let html = {
234 let agent = agent.clone();
235 let seed = opts.seed.clone();
236 spawn_blocking(move || fetch_html(&agent, &seed)).await.ok().flatten()
237 };
238 if let Some(html) = html {
239 for link in extract_links(&html, &opts.seed) {
240 if count >= opts.limit {
241 break;
242 }
243 if let Some(e) = validate_entry(link.as_str(), None, &opts.seed, &robots, opts, &mut visited) {
244 on_url(&e);
245 count += 1;
246 }
247 }
248 }
249 }
250}
251
252fn discover_sitemaps(robots: &RobotsPolicy, seed: &Url) -> Vec<Url> {
253 let mut urls = Vec::new();
254 if let RobotsPolicy::Rules(rules) = robots {
255 urls.extend(rules.sitemaps.iter().cloned());
256 }
257 if let Ok(default) = seed.join("/sitemap.xml") {
258 if !urls.contains(&default) {
259 urls.push(default);
260 }
261 }
262 urls
263}
264
265fn build_agent(ua: &str, timeout: Duration) -> ureq::Agent {
266 ureq::Agent::new_with_config(
267 ureq::config::Config::builder()
268 .max_redirects(0)
269 .http_status_as_error(false)
270 .timeout_global(Some(timeout))
271 .user_agent(ua)
272 .build(),
273 )
274}
275
276fn fetch_following_redirects(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<ureq::http::Response<ureq::Body>> {
277 let mut current = url.clone();
278 for _ in 0..MAP_MAX_REDIRECTS {
279 let resp = agent.get(current.as_str()).call().ok()?;
280 let status = resp.status().as_u16();
281 if matches!(status, 301 | 302 | 303 | 307 | 308) {
282 let location = resp.headers().get("location")?.to_str().ok()?;
283 let next = current.join(location).ok()?;
284 if net::validate_url_with_policy(next.as_str(), bridge::engine_policy()).is_err()
285 || !is_same_site(seed, &next)
286 {
287 return None;
288 }
289 current = next;
290 continue;
291 }
292 if status >= 400 {
293 return None;
294 }
295 return Some(resp);
296 }
297 None
298}
299
300fn fetch_sitemap(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<String> {
301 let resp = fetch_following_redirects(agent, url, seed)?;
302 let content_type = resp
303 .headers()
304 .get("content-type")
305 .and_then(|v| v.to_str().ok())
306 .unwrap_or("");
307
308 let is_gzip = url
309 .path()
310 .rsplit('/')
311 .next()
312 .and_then(|seg| std::path::Path::new(seg).extension())
313 .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
314 || content_type.contains("gzip")
315 || resp
316 .headers()
317 .get("content-encoding")
318 .and_then(|v| v.to_str().ok())
319 .is_some_and(|v| v.contains("gzip"));
320
321 if is_gzip {
322 let bytes = resp
323 .into_body()
324 .with_config()
325 .limit(MAP_SITEMAP_MAX_BYTES)
326 .read_to_vec()
327 .ok()?;
328 let mut decoded = Vec::new();
329 flate2::read::GzDecoder::new(bytes.as_slice())
330 .take(MAP_SITEMAP_MAX_DECOMPRESSED)
331 .read_to_end(&mut decoded)
332 .ok()?;
333 if decoded.len() as u64 > bytes.len() as u64 * MAP_GZIP_MAX_RATIO {
334 return None;
335 }
336 if looks_like_html(&decoded) {
337 return None;
338 }
339 String::from_utf8(decoded).ok()
340 } else {
341 let body = resp
342 .into_body()
343 .with_config()
344 .limit(MAP_SITEMAP_MAX_BYTES)
345 .read_to_string()
346 .ok()?;
347 if looks_like_html(body.as_bytes()) {
348 return None;
349 }
350 Some(body)
351 }
352}
353
354fn looks_like_html(bytes: &[u8]) -> bool {
355 const DOCTYPE: &[u8] = b"<!doctype";
356 const HTML: &[u8] = b"<html";
357 const BOM: &[u8] = b"\xef\xbb\xbf";
358 let mut prefix = bytes.get(..HTML_SNIFF_LEN).unwrap_or(bytes);
359 if prefix.starts_with(BOM) {
360 prefix = &prefix[BOM.len()..];
361 }
362 let prefix = prefix
363 .iter()
364 .position(|b| !b.is_ascii_whitespace())
365 .map_or(&[][..], |i| &prefix[i..]);
366 prefix
367 .get(..DOCTYPE.len())
368 .is_some_and(|p| p.eq_ignore_ascii_case(DOCTYPE))
369 || prefix.get(..HTML.len()).is_some_and(|p| p.eq_ignore_ascii_case(HTML))
370}
371
372fn fetch_html(agent: &ureq::Agent, url: &Url) -> Option<String> {
373 let resp = fetch_following_redirects(agent, url, url)?;
374 resp.into_body()
375 .with_config()
376 .limit(MAP_HTML_MAX_BYTES)
377 .read_to_string()
378 .ok()
379}
380
381fn extract_links(html: &str, base: &Url) -> Vec<Url> {
382 dom_query::Document::from(html)
383 .select("a[href]")
384 .iter()
385 .filter_map(|el| {
386 let href = el.attr("href")?;
387 let href = href.trim();
388 if href.is_empty() {
389 return None;
390 }
391 let resolved = base.join(href).ok()?;
392 matches!(resolved.scheme(), "http" | "https").then_some(resolved)
393 })
394 .collect()
395}
396
/// One record produced by `parse_sitemap`.
enum SitemapEntry {
    /// A `<url>` element: its `<loc>` text and optional `<lastmod>` text.
    Url { loc: String, lastmod: Option<String> },
    /// A `<sitemap>` element of a sitemap index: the `<loc>` of a nested sitemap.
    Sitemap { loc: String },
}
401
402fn parse_sitemap(body: &str) -> Vec<SitemapEntry> {
403 use quick_xml::events::Event;
404 use quick_xml::reader::Reader;
405
406 let mut reader = Reader::from_str(body);
407 let mut entries = Vec::new();
408 let mut buf = Vec::new();
409 let mut capture = Capture::Idle;
410 let mut loc = String::new();
411 let mut lastmod = String::new();
412 let mut in_url = false;
413 let mut in_sitemap = false;
414 let mut depth: u32 = 0;
415
416 loop {
417 match reader.read_event_into(&mut buf) {
418 Ok(Event::Start(e)) => {
419 let name = e.local_name();
420 match name.as_ref() {
421 b"url" => {
422 in_url = true;
423 depth = 0;
424 }
425 b"sitemap" => {
426 in_sitemap = true;
427 depth = 0;
428 }
429 b"loc" if (in_url || in_sitemap) && depth == 0 => capture = Capture::Loc,
430 b"lastmod" if in_url && depth == 0 => capture = Capture::Lastmod,
431 _ if in_url || in_sitemap => depth += 1,
432 _ => {}
433 }
434 }
435 Ok(Event::Text(e)) => {
436 if let Ok(text) = e.xml10_content() {
437 match capture {
438 Capture::Loc => loc.push_str(text.trim()),
439 Capture::Lastmod => lastmod.push_str(text.trim()),
440 Capture::Idle => {}
441 }
442 } else {
443 loc.clear();
444 lastmod.clear();
445 capture = Capture::Idle;
446 }
447 }
448 Ok(Event::GeneralRef(e)) => {
449 let resolved = match &*e {
450 b"amp" => "&",
451 b"lt" => "<",
452 b"gt" => ">",
453 b"quot" => "\"",
454 b"apos" => "'",
455 _ => "",
456 };
457 match capture {
458 Capture::Loc => loc.push_str(resolved),
459 Capture::Lastmod => lastmod.push_str(resolved),
460 Capture::Idle => {}
461 }
462 }
463 Ok(Event::End(e)) => {
464 let name = e.local_name();
465 match name.as_ref() {
466 b"url" if in_url => {
467 if !loc.is_empty() {
468 let lm = if lastmod.is_empty() {
469 None
470 } else {
471 Some(std::mem::take(&mut lastmod))
472 };
473 entries.push(SitemapEntry::Url {
474 loc: std::mem::take(&mut loc),
475 lastmod: lm,
476 });
477 }
478 loc.clear();
479 lastmod.clear();
480 in_url = false;
481 }
482 b"sitemap" if in_sitemap => {
483 if !loc.is_empty() {
484 entries.push(SitemapEntry::Sitemap {
485 loc: std::mem::take(&mut loc),
486 });
487 }
488 loc.clear();
489 lastmod.clear();
490 in_sitemap = false;
491 }
492 b"loc" | b"lastmod" if capture != Capture::Idle => capture = Capture::Idle,
493 _ if depth > 0 => depth -= 1,
494 _ => {}
495 }
496 }
497 Ok(Event::Eof) | Err(_) => break,
498 _ => {}
499 }
500 buf.clear();
501 }
502
503 entries
504}
505
/// Which field `parse_sitemap` is currently collecting character data for.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Capture {
    /// Not inside a captured element; character data is ignored.
    Idle,
    /// Inside a captured `<loc>` element.
    Loc,
    /// Inside a captured `<lastmod>` element.
    Lastmod,
}
512
513fn validate_entry(
514 loc: &str,
515 lastmod: Option<String>,
516 seed: &Url,
517 robots: &RobotsPolicy,
518 opts: &MapConfig,
519 visited: &mut HashSet<String>,
520) -> Option<MapEntry> {
521 if loc.len() > MAP_URL_MAX_LEN {
522 return None;
523 }
524 let url = Url::parse(loc)
525 .ok()
526 .filter(|u| matches!(u.scheme(), "http" | "https"))?;
527 if !is_same_site(seed, &url) {
528 return None;
529 }
530 if !robots.is_allowed(&url) {
531 return None;
532 }
533 if !matches_scope(&url, opts.include.as_ref(), opts.exclude.as_ref()) {
534 return None;
535 }
536 let normalized = normalize_url(&url);
537 if !visited.insert(normalized.clone()) {
538 return None;
539 }
540 Some(MapEntry {
541 url: normalized,
542 lastmod,
543 })
544}
545
546async fn throttle(last_fetch: &mut Instant) {
547 let elapsed = last_fetch.elapsed();
548 if elapsed < MAP_MIN_FETCH_INTERVAL {
549 tokio::time::sleep(MAP_MIN_FETCH_INTERVAL.saturating_sub(elapsed)).await;
550 }
551 *last_fetch = Instant::now();
552}
553
#[cfg(test)]
mod tests {
    use super::*;

    /// Config with no filters, limit 100 and fallback enabled, seeded at `seed`.
    fn test_config(seed: &str) -> MapConfig {
        MapConfig {
            seed: Url::parse(seed).unwrap(),
            limit: 100,
            include: None,
            exclude: None,
            user_agent: None,
            timeout: Duration::from_secs(30),
            no_fallback: false,
        }
    }

    /// Returns `(loc, lastmod)` of a `Url` entry; fails the test otherwise.
    fn expect_url(entry: &SitemapEntry) -> (&str, Option<&str>) {
        match entry {
            SitemapEntry::Url { loc, lastmod } => (loc.as_str(), lastmod.as_deref()),
            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
        }
    }

    /// Returns the `loc` of a `Sitemap` entry; fails the test otherwise.
    fn expect_sitemap(entry: &SitemapEntry) -> &str {
        match entry {
            SitemapEntry::Sitemap { loc } => loc.as_str(),
            SitemapEntry::Url { .. } => panic!("expected Sitemap"),
        }
    }

    /// Runs `validate_entry` for `loc` against `https://example.com` with
    /// robots unavailable and no `lastmod`.
    fn validate(loc: &str, visited: &mut HashSet<String>) -> Option<MapEntry> {
        let opts = test_config("https://example.com");
        let robots = RobotsPolicy::Unavailable;
        validate_entry(loc, None, &opts.seed, &robots, &opts, visited)
    }

    #[test]
    fn parse_urlset() {
        let xml = r#"<?xml version="1.0"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/a</loc><lastmod>2026-01-01</lastmod></url>
  <url><loc>https://example.com/b</loc></url>
</urlset>"#;
        let entries = parse_sitemap(xml);
        assert_eq!(entries.len(), 2);
        assert_eq!(
            expect_url(&entries[0]),
            ("https://example.com/a", Some("2026-01-01"))
        );
        assert_eq!(expect_url(&entries[1]), ("https://example.com/b", None));
    }

    #[test]
    fn parse_sitemapindex() {
        let xml = r#"<?xml version="1.0"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://example.com/sitemap1.xml</loc></sitemap>
  <sitemap><loc>https://example.com/sitemap2.xml</loc></sitemap>
</sitemapindex>"#;
        let entries = parse_sitemap(xml);
        assert_eq!(entries.len(), 2);
        assert_eq!(expect_sitemap(&entries[0]), "https://example.com/sitemap1.xml");
    }

    #[test]
    fn parse_handles_xml_entities() {
        // The ampersand must be escaped: a bare `&` is not well-formed XML and
        // would not exercise entity decoding at all.
        let xml = r"<urlset><url><loc>https://example.com/a?b=1&amp;c=2</loc></url></urlset>";
        let entries = parse_sitemap(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(expect_url(&entries[0]).0, "https://example.com/a?b=1&c=2");
    }

    #[test]
    fn parse_handles_namespaced_tags() {
        let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
  <url>
    <loc>https://example.com/page</loc>
    <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
  </url>
</urlset>"#;
        let entries = parse_sitemap(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(expect_url(&entries[0]).0, "https://example.com/page");
    }

    #[test]
    fn parse_loc_after_nested_extension() {
        let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
        xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
  <url>
    <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
    <loc>https://example.com/page</loc>
    <lastmod>2026-01-01</lastmod>
  </url>
</urlset>"#;
        let entries = parse_sitemap(xml);
        assert_eq!(entries.len(), 1);
        assert_eq!(
            expect_url(&entries[0]),
            ("https://example.com/page", Some("2026-01-01"))
        );
    }

    #[test]
    fn parse_empty_body_returns_empty() {
        assert!(parse_sitemap("").is_empty());
        assert!(parse_sitemap("<html><body>Not Found</body></html>").is_empty());
    }

    #[test]
    fn looks_like_html_detects_variants() {
        let html: [&[u8]; 7] = [
            b"<!DOCTYPE html>",
            b"<!doctype html>",
            b"<html lang=\"en\">",
            b"<HTML>",
            b"\xef\xbb\xbf<!DOCTYPE html>",
            b"  \n<!doctype html>",
            b"\xef\xbb\xbf  <html>",
        ];
        for sample in html.iter() {
            assert!(looks_like_html(sample));
        }

        let not_html: [&[u8]; 3] = [b"<?xml version=\"1.0\"?>", b"<urlset>", b""];
        for sample in not_html.iter() {
            assert!(!looks_like_html(sample));
        }
    }

    #[test]
    fn validate_entry_rejects_private_ip() {
        let mut visited = HashSet::new();
        assert!(validate("http://127.0.0.1/secret", &mut visited).is_none());
    }

    #[test]
    fn validate_entry_rejects_cross_site() {
        let mut visited = HashSet::new();
        assert!(validate("https://evil.com/page", &mut visited).is_none());
    }

    #[test]
    fn validate_entry_deduplicates() {
        let mut visited = HashSet::new();
        assert!(validate("https://example.com/page", &mut visited).is_some());
        assert!(validate("https://example.com/page", &mut visited).is_none());
    }

    #[test]
    fn validate_entry_rejects_long_url() {
        let mut visited = HashSet::new();
        let long_url = format!("https://example.com/{}", "a".repeat(MAP_URL_MAX_LEN));
        assert!(validate(&long_url, &mut visited).is_none());
    }

    #[test]
    fn discover_sitemaps_includes_robots_and_default() {
        let seed = Url::parse("https://example.com").unwrap();
        let robots = RobotsPolicy::Rules(RobotsRules {
            rules: Vec::new(),
            sitemaps: vec![Url::parse("https://example.com/custom-sitemap.xml").unwrap()],
        });
        let sitemaps = discover_sitemaps(&robots, &seed);
        assert_eq!(sitemaps.len(), 2);
        assert_eq!(sitemaps[0].as_str(), "https://example.com/custom-sitemap.xml");
        assert_eq!(sitemaps[1].as_str(), "https://example.com/sitemap.xml");
    }

    #[test]
    fn discover_sitemaps_deduplicates_default() {
        let seed = Url::parse("https://example.com").unwrap();
        let robots = RobotsPolicy::Rules(RobotsRules {
            rules: Vec::new(),
            sitemaps: vec![Url::parse("https://example.com/sitemap.xml").unwrap()],
        });
        let sitemaps = discover_sitemaps(&robots, &seed);
        assert_eq!(sitemaps.len(), 1);
    }

    mod integration {
        use std::time::Duration;

        use tokio::task::spawn_blocking;
        use url::Url;
        use wiremock::matchers::{method, path};
        use wiremock::{Mock, MockServer, ResponseTemplate};

        use crate::map::{
            MapConfig, MapEntry, build_agent, extract_links, fetch_html, fetch_sitemap,
            parse_sitemap, run,
        };

        /// Registers `response` as the reply to `GET route` on `server`.
        async fn mount_get(server: &MockServer, route: &str, response: ResponseTemplate) {
            Mock::given(method("GET"))
                .and(path(route))
                .respond_with(response)
                .mount(server)
                .await;
        }

        /// Calls `fetch_sitemap` for `route` on `server` from a blocking
        /// thread, using the sitemap URL itself as the seed.
        async fn fetch_sitemap_from(server: &MockServer, route: &str) -> Option<String> {
            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let url = Url::parse(&format!("{}{route}", server.uri())).unwrap();
            spawn_blocking(move || fetch_sitemap(&agent, &url, &url))
                .await
                .unwrap()
        }

        /// Runs `run` against `server` with a default config adjusted by
        /// `configure`, and returns the reported entries.
        async fn check_run(
            server: &MockServer,
            configure: impl FnOnce(&mut MapConfig),
        ) -> Vec<MapEntry> {
            let mut config = MapConfig {
                seed: Url::parse(&server.uri()).unwrap(),
                limit: 100,
                include: None,
                exclude: None,
                user_agent: Some("test-bot".into()),
                timeout: Duration::from_secs(5),
                no_fallback: false,
            };
            configure(&mut config);
            let mut entries = Vec::new();
            run(&config, |e| {
                entries.push(MapEntry {
                    url: e.url.clone(),
                    lastmod: e.lastmod.clone(),
                });
            })
            .await;
            entries
        }

        #[tokio::test]
        async fn fetch_sitemap_parses_urlset() {
            let server = MockServer::start().await;
            let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/a</loc></url></urlset>"#;
            mount_get(
                &server,
                "/sitemap.xml",
                ResponseTemplate::new(200).set_body_raw(xml.as_bytes().to_vec(), "application/xml"),
            )
            .await;

            let body = fetch_sitemap_from(&server, "/sitemap.xml").await;

            let entries = parse_sitemap(&body.unwrap());
            assert_eq!(entries.len(), 1);
        }

        #[tokio::test]
        async fn fetch_sitemap_rejects_html_error_page() {
            let server = MockServer::start().await;
            mount_get(
                &server,
                "/sitemap.xml",
                ResponseTemplate::new(200).set_body_raw(
                    b"<!DOCTYPE html><html><body>Not Found</body></html>".to_vec(),
                    "text/html; charset=utf-8",
                ),
            )
            .await;

            let body = fetch_sitemap_from(&server, "/sitemap.xml").await;

            assert!(body.is_none());
        }

        #[tokio::test]
        async fn fetch_sitemap_returns_none_on_404() {
            let server = MockServer::start().await;
            mount_get(&server, "/sitemap.xml", ResponseTemplate::new(404)).await;

            let body = fetch_sitemap_from(&server, "/sitemap.xml").await;

            assert!(body.is_none());
        }

        #[tokio::test]
        async fn fetch_sitemap_handles_gzip() {
            use std::io::Write as _;

            use flate2::Compression;
            use flate2::write::GzEncoder;

            let server = MockServer::start().await;
            let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/gz</loc></url></urlset>"#;
            let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
            encoder.write_all(xml.as_bytes()).unwrap();
            let compressed = encoder.finish().unwrap();

            mount_get(
                &server,
                "/sitemap.xml.gz",
                ResponseTemplate::new(200).set_body_raw(compressed, "application/gzip"),
            )
            .await;

            let body = fetch_sitemap_from(&server, "/sitemap.xml.gz").await;

            let entries = parse_sitemap(&body.unwrap());
            assert_eq!(entries.len(), 1);
        }

        #[tokio::test]
        async fn fetch_html_extracts_links() {
            let server = MockServer::start().await;
            mount_get(
                &server,
                "/",
                ResponseTemplate::new(200).set_body_raw(
                    br#"<html><body><a href="/link">x</a></body></html>"#.to_vec(),
                    "text/html; charset=utf-8",
                ),
            )
            .await;

            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let seed = Url::parse(&server.uri()).unwrap();
            let html = spawn_blocking({
                let seed = seed.clone();
                move || fetch_html(&agent, &seed)
            })
            .await
            .unwrap()
            .unwrap();

            let links = extract_links(&html, &seed);
            assert_eq!(links.len(), 1);
        }

        #[tokio::test]
        async fn run_discovers_urls_from_sitemap() {
            let server = MockServer::start().await;
            mount_get(
                &server,
                "/robots.txt",
                ResponseTemplate::new(200).set_body_string("User-agent: *\nAllow: /"),
            )
            .await;
            let sitemap = format!(
                "<urlset><url><loc>{}/page1</loc></url><url><loc>{}/page2</loc></url></urlset>",
                server.uri(),
                server.uri()
            );
            mount_get(
                &server,
                "/sitemap.xml",
                ResponseTemplate::new(200).set_body_string(sitemap),
            )
            .await;

            let entries = check_run(&server, |_| {}).await;
            assert_eq!(entries.len(), 2);
            assert!(entries.iter().any(|e| e.url.ends_with("/page1")));
            assert!(entries.iter().any(|e| e.url.ends_with("/page2")));
        }

        #[tokio::test]
        async fn run_respects_limit() {
            let server = MockServer::start().await;
            mount_get(&server, "/robots.txt", ResponseTemplate::new(404)).await;
            let sitemap = format!(
                "<urlset><url><loc>{}/a</loc></url><url><loc>{}/b</loc></url><url><loc>{}/c</loc></url></urlset>",
                server.uri(),
                server.uri(),
                server.uri()
            );
            mount_get(
                &server,
                "/sitemap.xml",
                ResponseTemplate::new(200).set_body_string(sitemap),
            )
            .await;

            let entries = check_run(&server, |c| c.limit = 2).await;
            assert_eq!(entries.len(), 2);
        }

        #[tokio::test]
        async fn run_follows_sitemap_index() {
            let server = MockServer::start().await;
            mount_get(&server, "/robots.txt", ResponseTemplate::new(404)).await;
            let index = format!(
                "<sitemapindex><sitemap><loc>{}/sub.xml</loc></sitemap></sitemapindex>",
                server.uri()
            );
            mount_get(
                &server,
                "/sitemap.xml",
                ResponseTemplate::new(200).set_body_string(index),
            )
            .await;
            let sub = format!(
                "<urlset><url><loc>{}/deep</loc></url></urlset>",
                server.uri()
            );
            mount_get(
                &server,
                "/sub.xml",
                ResponseTemplate::new(200).set_body_string(sub),
            )
            .await;

            let entries = check_run(&server, |_| {}).await;
            assert_eq!(entries.len(), 1);
            assert!(entries[0].url.ends_with("/deep"));
        }

        #[tokio::test]
        async fn run_falls_back_to_html_links() {
            let server = MockServer::start().await;
            mount_get(&server, "/robots.txt", ResponseTemplate::new(404)).await;
            mount_get(&server, "/sitemap.xml", ResponseTemplate::new(404)).await;
            let html = format!(
                r#"<html><body><a href="{}/link1">L1</a><a href="{}/link2">L2</a></body></html>"#,
                server.uri(),
                server.uri()
            );
            mount_get(
                &server,
                "/",
                ResponseTemplate::new(200).set_body_string(html),
            )
            .await;

            let entries = check_run(&server, |_| {}).await;
            assert_eq!(entries.len(), 2);
        }

        #[tokio::test]
        async fn run_no_fallback_skips_html() {
            let server = MockServer::start().await;
            mount_get(&server, "/robots.txt", ResponseTemplate::new(404)).await;
            mount_get(&server, "/sitemap.xml", ResponseTemplate::new(404)).await;
            mount_get(
                &server,
                "/",
                ResponseTemplate::new(200)
                    .set_body_string(r#"<html><body><a href="/link">L</a></body></html>"#),
            )
            .await;

            let entries = check_run(&server, |c| c.no_fallback = true).await;
            assert_eq!(entries.len(), 0);
        }

        #[tokio::test]
        async fn run_deduplicates_urls() {
            let server = MockServer::start().await;
            mount_get(&server, "/robots.txt", ResponseTemplate::new(404)).await;
            let sitemap = format!(
                "<urlset><url><loc>{}/dup</loc></url><url><loc>{}/dup</loc></url><url><loc>{}/unique</loc></url></urlset>",
                server.uri(),
                server.uri(),
                server.uri()
            );
            mount_get(
                &server,
                "/sitemap.xml",
                ResponseTemplate::new(200).set_body_string(sitemap),
            )
            .await;

            let entries = check_run(&server, |_| {}).await;
            assert_eq!(entries.len(), 2);
        }
    }
}