1use std::collections::{HashSet, VecDeque};
4use std::io::Read as _;
5use std::time::{Duration, Instant};
6
7use url::Url;
8
9use crate::bridge;
10use crate::net;
11use crate::robots::{RobotsPolicy, RobotsRules};
12use crate::scope::{is_same_site, matches_scope, normalize_url};
13
/// Read limit, in bytes, applied to a sitemap response body (gzip or plain).
const MAP_SITEMAP_MAX_BYTES: u64 = 50 * 1024 * 1024;
/// Maximum number of bytes read out of the gzip decoder for one sitemap.
const MAP_SITEMAP_MAX_DECOMPRESSED: u64 = 10 * 1024 * 1024;
/// A gzip sitemap is rejected when its decoded size exceeds the compressed
/// size multiplied by this factor.
const MAP_GZIP_MAX_RATIO: u64 = 100;
/// Read limit, in bytes, applied to the HTML page fetched for the link fallback.
const MAP_HTML_MAX_BYTES: u64 = 2 * 1024 * 1024;
/// Bound used by the manual redirect loop in `fetch_following_redirects`.
const MAP_MAX_REDIRECTS: u8 = 5;
/// `run` stops pulling from the sitemap queue once this many fetches were attempted.
const MAP_MAX_SITEMAPS: usize = 200;
/// Queued sitemaps nested deeper than this below a discovered root are skipped.
const MAP_MAX_INDEX_DEPTH: u8 = 5;
/// Minimum spacing that `throttle` enforces between consecutive fetches.
const MAP_MIN_FETCH_INTERVAL: Duration = Duration::from_millis(500);
/// Candidate locations longer than this many bytes are rejected by `validate_entry`.
const MAP_URL_MAX_LEN: usize = 2048;
/// Number of leading bytes `looks_like_html` inspects.
const HTML_SNIFF_LEN: usize = 100;
24
/// Builder-style options consumed by [`map`].
///
/// Fields are private; construct with [`MapOptions::new`] and adjust with the
/// chained setters.
#[must_use = "options do nothing until passed to map()"]
#[derive(Debug, Clone)]
pub struct MapOptions {
    /// Seed URL as given by the caller; `map` validates it before use.
    url: String,
    /// Maximum number of URLs to report.
    limit: usize,
    /// Glob patterns for the include filter; empty means no include filter.
    include: Vec<String>,
    /// Glob patterns for the exclude filter; empty means no exclude filter.
    exclude: Vec<String>,
    /// User-Agent override; `None` falls back to the crate default.
    user_agent: Option<String>,
    /// Request timeout in whole seconds.
    timeout: u64,
    /// When `true`, the HTML link fallback is skipped even if no URL was found.
    no_fallback: bool,
}
37
38impl MapOptions {
39 pub fn new(url: impl Into<String>) -> Self {
41 Self {
42 url: url.into(),
43 limit: 5000,
44 include: Vec::new(),
45 exclude: Vec::new(),
46 user_agent: None,
47 timeout: 30,
48 no_fallback: false,
49 }
50 }
51
52 pub fn limit(mut self, n: usize) -> Self {
54 self.limit = n;
55 self
56 }
57
58 pub fn include(mut self, patterns: &[&str]) -> Self {
60 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
61 self
62 }
63
64 pub fn exclude(mut self, patterns: &[&str]) -> Self {
66 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
67 self
68 }
69
70 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
72 self.user_agent = Some(ua.into());
73 self
74 }
75
76 pub fn timeout(mut self, secs: u64) -> Self {
78 self.timeout = secs;
79 self
80 }
81
82 pub fn no_fallback(mut self, yes: bool) -> Self {
84 self.no_fallback = yes;
85 self
86 }
87}
88
/// One URL reported by [`map`].
#[derive(Debug, Clone, serde::Serialize)]
pub struct MappedUrl {
    /// The discovered URL, copied from the internal entry produced during mapping.
    pub url: String,
    /// The sitemap `<lastmod>` text when one was captured; omitted from
    /// serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
98
99#[allow(clippy::needless_pass_by_value)]
101pub fn map(opts: MapOptions) -> crate::error::Result<Vec<MappedUrl>> {
102 net::ensure_crypto_provider();
103 let seed = net::validate_url(&opts.url)?;
104
105 let include = if opts.include.is_empty() {
106 None
107 } else {
108 Some(crate::scope::build_globset(&opts.include)?)
109 };
110 let exclude = if opts.exclude.is_empty() {
111 None
112 } else {
113 Some(crate::scope::build_globset(&opts.exclude)?)
114 };
115
116 let internal = MapConfig {
117 seed,
118 limit: opts.limit,
119 include,
120 exclude,
121 user_agent: opts.user_agent,
122 timeout: Duration::from_secs(opts.timeout),
123 no_fallback: opts.no_fallback,
124 };
125
126 let mut results = Vec::new();
127 crate::runtime::block_on(run(&internal, |entry| {
128 results.push(MappedUrl {
129 url: entry.url.clone(),
130 lastmod: entry.lastmod.clone(),
131 });
132 }))
133 .map_err(|e| crate::error::Error::Engine(e.to_string()))?;
134 Ok(results)
135}
136
/// Resolved, crate-internal form of the mapping options consumed by [`run`].
pub(crate) struct MapConfig {
    /// Parsed seed URL; same-site checks and the default sitemap location are
    /// derived from it.
    pub seed: Url,
    /// Maximum number of URLs to report.
    pub limit: usize,
    /// Compiled include filter, or `None` for no include filter.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude filter, or `None` for no exclude filter.
    pub exclude: Option<globset::GlobSet>,
    /// User-Agent override; `None` selects the crate default.
    pub user_agent: Option<String>,
    /// Timeout handed to the HTTP agent and to the robots.txt fetch.
    pub timeout: Duration,
    /// When `true`, the HTML link fallback is never attempted.
    pub no_fallback: bool,
}
147
/// A validated URL handed to the `on_url` callback of [`run`].
#[derive(serde::Serialize)]
pub(crate) struct MapEntry {
    /// Normalized form of the URL (as returned by `normalize_url`).
    pub url: String,
    /// Sitemap `<lastmod>` text, if any; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
155
156pub(crate) async fn run(opts: &MapConfig, mut on_url: impl FnMut(&MapEntry)) {
158 let ua = opts
159 .user_agent
160 .as_deref()
161 .unwrap_or_else(|| bridge::default_user_agent());
162 let agent = build_agent(ua, opts.timeout);
163
164 let robots = {
165 let seed = opts.seed.clone();
166 let user_agent = opts.user_agent.clone();
167 let timeout = opts.timeout;
168 tokio::task::spawn_blocking(move || RobotsRules::fetch(&seed, user_agent.as_deref(), timeout))
169 .await
170 .unwrap_or(RobotsPolicy::Unreachable)
171 };
172
173 let mut visited = HashSet::new();
174 let mut count = 0;
175 let mut last_fetch = Instant::now()
176 .checked_sub(MAP_MIN_FETCH_INTERVAL)
177 .unwrap_or_else(Instant::now);
178 let mut sitemap_queue: VecDeque<(Url, u8)> = discover_sitemaps(&robots, &opts.seed)
179 .into_iter()
180 .map(|u| (u, 0))
181 .collect();
182 let mut sitemaps_fetched = 0;
183
184 while let Some((sitemap_url, depth)) = sitemap_queue.pop_front() {
185 if sitemaps_fetched >= MAP_MAX_SITEMAPS || count >= opts.limit {
186 break;
187 }
188 if depth > MAP_MAX_INDEX_DEPTH || !is_same_site(&opts.seed, &sitemap_url) {
189 continue;
190 }
191
192 throttle(&mut last_fetch).await;
193 sitemaps_fetched += 1;
194
195 let body = {
196 let agent = agent.clone();
197 tokio::task::spawn_blocking({
198 let seed = opts.seed.clone();
199 move || fetch_sitemap(&agent, &sitemap_url, &seed)
200 })
201 .await
202 .ok()
203 .flatten()
204 };
205 let Some(body) = body else { continue };
206
207 for entry in parse_sitemap(&body) {
208 match entry {
209 SitemapEntry::Url { loc, lastmod } => {
210 if count >= opts.limit {
211 break;
212 }
213 if let Some(e) = validate_entry(&loc, lastmod, &opts.seed, &robots, opts, &mut visited) {
214 on_url(&e);
215 count += 1;
216 }
217 }
218 SitemapEntry::Sitemap { loc } => {
219 if let Ok(url) = Url::parse(&loc) {
220 sitemap_queue.push_back((url, depth + 1));
221 }
222 }
223 }
224 }
225 }
226
227 if count == 0 && !opts.no_fallback {
228 throttle(&mut last_fetch).await;
229 let html = {
230 let agent = agent.clone();
231 let seed = opts.seed.clone();
232 tokio::task::spawn_blocking(move || fetch_html(&agent, &seed))
233 .await
234 .ok()
235 .flatten()
236 };
237 if let Some(html) = html {
238 for link in extract_links(&html, &opts.seed) {
239 if count >= opts.limit {
240 break;
241 }
242 if let Some(e) = validate_entry(link.as_str(), None, &opts.seed, &robots, opts, &mut visited) {
243 on_url(&e);
244 count += 1;
245 }
246 }
247 }
248 }
249}
250
251fn discover_sitemaps(robots: &RobotsPolicy, seed: &Url) -> Vec<Url> {
252 let mut urls = Vec::new();
253 if let RobotsPolicy::Rules(rules) = robots {
254 urls.extend(rules.sitemaps.iter().cloned());
255 }
256 if let Ok(default) = seed.join("/sitemap.xml") {
257 if !urls.contains(&default) {
258 urls.push(default);
259 }
260 }
261 urls
262}
263
264fn build_agent(ua: &str, timeout: Duration) -> ureq::Agent {
265 ureq::Agent::new_with_config(
266 ureq::config::Config::builder()
267 .max_redirects(0)
268 .http_status_as_error(false)
269 .timeout_global(Some(timeout))
270 .user_agent(ua)
271 .build(),
272 )
273}
274
275fn fetch_following_redirects(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<ureq::http::Response<ureq::Body>> {
276 let mut current = url.clone();
277 for _ in 0..MAP_MAX_REDIRECTS {
278 let resp = agent.get(current.as_str()).call().ok()?;
279 let status = resp.status().as_u16();
280 if matches!(status, 301 | 302 | 303 | 307 | 308) {
281 let location = resp.headers().get("location")?.to_str().ok()?;
282 let next = current.join(location).ok()?;
283 if net::validate_url_with_policy(next.as_str(), bridge::engine_policy()).is_err()
284 || !is_same_site(seed, &next)
285 {
286 return None;
287 }
288 current = next;
289 continue;
290 }
291 if status >= 400 {
292 return None;
293 }
294 return Some(resp);
295 }
296 None
297}
298
299fn fetch_sitemap(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<String> {
300 let resp = fetch_following_redirects(agent, url, seed)?;
301 let content_type = resp
302 .headers()
303 .get("content-type")
304 .and_then(|v| v.to_str().ok())
305 .unwrap_or("");
306
307 let is_gzip = url
308 .path()
309 .rsplit('/')
310 .next()
311 .and_then(|seg| std::path::Path::new(seg).extension())
312 .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
313 || content_type.contains("gzip")
314 || resp
315 .headers()
316 .get("content-encoding")
317 .and_then(|v| v.to_str().ok())
318 .is_some_and(|v| v.contains("gzip"));
319
320 if is_gzip {
321 let bytes = resp
322 .into_body()
323 .with_config()
324 .limit(MAP_SITEMAP_MAX_BYTES)
325 .read_to_vec()
326 .ok()?;
327 let mut decoded = Vec::new();
328 flate2::read::GzDecoder::new(bytes.as_slice())
329 .take(MAP_SITEMAP_MAX_DECOMPRESSED)
330 .read_to_end(&mut decoded)
331 .ok()?;
332 if decoded.len() as u64 > bytes.len() as u64 * MAP_GZIP_MAX_RATIO {
333 return None;
334 }
335 if looks_like_html(&decoded) {
336 return None;
337 }
338 String::from_utf8(decoded).ok()
339 } else {
340 let body = resp
341 .into_body()
342 .with_config()
343 .limit(MAP_SITEMAP_MAX_BYTES)
344 .read_to_string()
345 .ok()?;
346 if looks_like_html(body.as_bytes()) {
347 return None;
348 }
349 Some(body)
350 }
351}
352
353fn looks_like_html(bytes: &[u8]) -> bool {
354 const DOCTYPE: &[u8] = b"<!doctype";
355 const HTML: &[u8] = b"<html";
356 const BOM: &[u8] = b"\xef\xbb\xbf";
357 let mut prefix = bytes.get(..HTML_SNIFF_LEN).unwrap_or(bytes);
358 if prefix.starts_with(BOM) {
359 prefix = &prefix[BOM.len()..];
360 }
361 let prefix = prefix
362 .iter()
363 .position(|b| !b.is_ascii_whitespace())
364 .map_or(&[][..], |i| &prefix[i..]);
365 prefix
366 .get(..DOCTYPE.len())
367 .is_some_and(|p| p.eq_ignore_ascii_case(DOCTYPE))
368 || prefix.get(..HTML.len()).is_some_and(|p| p.eq_ignore_ascii_case(HTML))
369}
370
371fn fetch_html(agent: &ureq::Agent, url: &Url) -> Option<String> {
372 let resp = fetch_following_redirects(agent, url, url)?;
373 resp.into_body()
374 .with_config()
375 .limit(MAP_HTML_MAX_BYTES)
376 .read_to_string()
377 .ok()
378}
379
380fn extract_links(html: &str, base: &Url) -> Vec<Url> {
381 dom_query::Document::from(html)
382 .select("a[href]")
383 .iter()
384 .filter_map(|el| {
385 let href = el.attr("href")?;
386 let href = href.trim();
387 if href.is_empty() {
388 return None;
389 }
390 let resolved = base.join(href).ok()?;
391 matches!(resolved.scheme(), "http" | "https").then_some(resolved)
392 })
393 .collect()
394}
395
/// One record produced by `parse_sitemap`.
enum SitemapEntry {
    /// A `<url>` element: its `<loc>` text and optional `<lastmod>` text.
    Url { loc: String, lastmod: Option<String> },
    /// A `<sitemap>` element (sitemap index entry): the `<loc>` of a nested sitemap.
    Sitemap { loc: String },
}
400
401fn parse_sitemap(body: &str) -> Vec<SitemapEntry> {
402 use quick_xml::events::Event;
403 use quick_xml::reader::Reader;
404
405 let mut reader = Reader::from_str(body);
406 let mut entries = Vec::new();
407 let mut buf = Vec::new();
408 let mut capture = Capture::Idle;
409 let mut loc = String::new();
410 let mut lastmod = String::new();
411 let mut in_url = false;
412 let mut in_sitemap = false;
413 let mut depth: u32 = 0;
414
415 loop {
416 match reader.read_event_into(&mut buf) {
417 Ok(Event::Start(e)) => {
418 let name = e.local_name();
419 match name.as_ref() {
420 b"url" => {
421 in_url = true;
422 depth = 0;
423 }
424 b"sitemap" => {
425 in_sitemap = true;
426 depth = 0;
427 }
428 b"loc" if (in_url || in_sitemap) && depth == 0 => capture = Capture::Loc,
429 b"lastmod" if in_url && depth == 0 => capture = Capture::Lastmod,
430 _ if in_url || in_sitemap => depth += 1,
431 _ => {}
432 }
433 }
434 Ok(Event::Text(e)) => {
435 if let Ok(text) = e.xml10_content() {
436 match capture {
437 Capture::Loc => loc.push_str(text.trim()),
438 Capture::Lastmod => lastmod.push_str(text.trim()),
439 Capture::Idle => {}
440 }
441 } else {
442 loc.clear();
443 lastmod.clear();
444 capture = Capture::Idle;
445 }
446 }
447 Ok(Event::GeneralRef(e)) => {
448 let resolved = match &*e {
449 b"amp" => "&",
450 b"lt" => "<",
451 b"gt" => ">",
452 b"quot" => "\"",
453 b"apos" => "'",
454 _ => "",
455 };
456 match capture {
457 Capture::Loc => loc.push_str(resolved),
458 Capture::Lastmod => lastmod.push_str(resolved),
459 Capture::Idle => {}
460 }
461 }
462 Ok(Event::End(e)) => {
463 let name = e.local_name();
464 match name.as_ref() {
465 b"url" if in_url => {
466 if !loc.is_empty() {
467 let lm = if lastmod.is_empty() {
468 None
469 } else {
470 Some(std::mem::take(&mut lastmod))
471 };
472 entries.push(SitemapEntry::Url {
473 loc: std::mem::take(&mut loc),
474 lastmod: lm,
475 });
476 }
477 loc.clear();
478 lastmod.clear();
479 in_url = false;
480 }
481 b"sitemap" if in_sitemap => {
482 if !loc.is_empty() {
483 entries.push(SitemapEntry::Sitemap {
484 loc: std::mem::take(&mut loc),
485 });
486 }
487 loc.clear();
488 lastmod.clear();
489 in_sitemap = false;
490 }
491 b"loc" | b"lastmod" if capture != Capture::Idle => capture = Capture::Idle,
492 _ if depth > 0 => depth -= 1,
493 _ => {}
494 }
495 }
496 Ok(Event::Eof) | Err(_) => break,
497 _ => {}
498 }
499 buf.clear();
500 }
501
502 entries
503}
504
/// Which field, if any, `parse_sitemap` is currently appending character data to.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Capture {
    /// Character data is ignored.
    Idle,
    /// Character data is appended to the pending `<loc>` value.
    Loc,
    /// Character data is appended to the pending `<lastmod>` value.
    Lastmod,
}
511
512fn validate_entry(
513 loc: &str,
514 lastmod: Option<String>,
515 seed: &Url,
516 robots: &RobotsPolicy,
517 opts: &MapConfig,
518 visited: &mut HashSet<String>,
519) -> Option<MapEntry> {
520 if loc.len() > MAP_URL_MAX_LEN {
521 return None;
522 }
523 let url = Url::parse(loc)
524 .ok()
525 .filter(|u| matches!(u.scheme(), "http" | "https"))?;
526 if !is_same_site(seed, &url) {
527 return None;
528 }
529 if !robots.is_allowed(&url) {
530 return None;
531 }
532 if !matches_scope(&url, opts.include.as_ref(), opts.exclude.as_ref()) {
533 return None;
534 }
535 let normalized = normalize_url(&url);
536 if !visited.insert(normalized.clone()) {
537 return None;
538 }
539 Some(MapEntry {
540 url: normalized,
541 lastmod,
542 })
543}
544
545async fn throttle(last_fetch: &mut Instant) {
546 let elapsed = last_fetch.elapsed();
547 if elapsed < MAP_MIN_FETCH_INTERVAL {
548 tokio::time::sleep(MAP_MIN_FETCH_INTERVAL.saturating_sub(elapsed)).await;
549 }
550 *last_fetch = Instant::now();
551}
552
553#[cfg(test)]
554mod tests {
555 use super::*;
556
557 fn test_config(seed: &str) -> MapConfig {
558 MapConfig {
559 seed: Url::parse(seed).unwrap(),
560 limit: 100,
561 include: None,
562 exclude: None,
563 user_agent: None,
564 timeout: Duration::from_secs(30),
565 no_fallback: false,
566 }
567 }
568
569 #[test]
570 fn parse_urlset() {
571 let xml = r#"<?xml version="1.0"?>
572<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
573 <url><loc>https://example.com/a</loc><lastmod>2026-01-01</lastmod></url>
574 <url><loc>https://example.com/b</loc></url>
575</urlset>"#;
576 let entries = parse_sitemap(xml);
577 assert_eq!(entries.len(), 2);
578 match &entries[0] {
579 SitemapEntry::Url { loc, lastmod } => {
580 assert_eq!(loc, "https://example.com/a");
581 assert_eq!(lastmod.as_deref(), Some("2026-01-01"));
582 }
583 SitemapEntry::Sitemap { .. } => panic!("expected Url"),
584 }
585 match &entries[1] {
586 SitemapEntry::Url { loc, lastmod } => {
587 assert_eq!(loc, "https://example.com/b");
588 assert!(lastmod.is_none());
589 }
590 SitemapEntry::Sitemap { .. } => panic!("expected Url"),
591 }
592 }
593
594 #[test]
595 fn parse_sitemapindex() {
596 let xml = r#"<?xml version="1.0"?>
597<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
598 <sitemap><loc>https://example.com/sitemap1.xml</loc></sitemap>
599 <sitemap><loc>https://example.com/sitemap2.xml</loc></sitemap>
600</sitemapindex>"#;
601 let entries = parse_sitemap(xml);
602 assert_eq!(entries.len(), 2);
603 match &entries[0] {
604 SitemapEntry::Sitemap { loc } => assert_eq!(loc, "https://example.com/sitemap1.xml"),
605 SitemapEntry::Url { .. } => panic!("expected Sitemap"),
606 }
607 }
608
609 #[test]
610 fn parse_handles_xml_entities() {
611 let xml = r"<urlset><url><loc>https://example.com/a?b=1&c=2</loc></url></urlset>";
612 let entries = parse_sitemap(xml);
613 assert_eq!(entries.len(), 1);
614 match &entries[0] {
615 SitemapEntry::Url { loc, .. } => assert_eq!(loc, "https://example.com/a?b=1&c=2"),
616 SitemapEntry::Sitemap { .. } => panic!("expected Url"),
617 }
618 }
619
620 #[test]
621 fn parse_handles_namespaced_tags() {
622 let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
623 xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
624 <url>
625 <loc>https://example.com/page</loc>
626 <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
627 </url>
628</urlset>"#;
629 let entries = parse_sitemap(xml);
630 assert_eq!(entries.len(), 1);
631 match &entries[0] {
632 SitemapEntry::Url { loc, .. } => assert_eq!(loc, "https://example.com/page"),
633 SitemapEntry::Sitemap { .. } => panic!("expected Url"),
634 }
635 }
636
637 #[test]
638 fn parse_loc_after_nested_extension() {
639 let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
640 xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
641 <url>
642 <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
643 <loc>https://example.com/page</loc>
644 <lastmod>2026-01-01</lastmod>
645 </url>
646</urlset>"#;
647 let entries = parse_sitemap(xml);
648 assert_eq!(entries.len(), 1);
649 match &entries[0] {
650 SitemapEntry::Url { loc, lastmod } => {
651 assert_eq!(loc, "https://example.com/page");
652 assert_eq!(lastmod.as_deref(), Some("2026-01-01"));
653 }
654 SitemapEntry::Sitemap { .. } => panic!("expected Url"),
655 }
656 }
657
658 #[test]
659 fn parse_empty_body_returns_empty() {
660 assert!(parse_sitemap("").is_empty());
661 assert!(parse_sitemap("<html><body>Not Found</body></html>").is_empty());
662 }
663
664 #[test]
665 fn looks_like_html_detects_variants() {
666 assert!(looks_like_html(b"<!DOCTYPE html>"));
667 assert!(looks_like_html(b"<!doctype html>"));
668 assert!(looks_like_html(b"<html lang=\"en\">"));
669 assert!(looks_like_html(b"<HTML>"));
670 assert!(looks_like_html(b"\xef\xbb\xbf<!DOCTYPE html>"));
671 assert!(looks_like_html(b" \n<!doctype html>"));
672 assert!(looks_like_html(b"\xef\xbb\xbf <html>"));
673 assert!(!looks_like_html(b"<?xml version=\"1.0\"?>"));
674 assert!(!looks_like_html(b"<urlset>"));
675 assert!(!looks_like_html(b""));
676 }
677
678 #[test]
679 fn validate_entry_rejects_private_ip() {
680 let opts = test_config("https://example.com");
681
682 let mut visited = HashSet::new();
683 let result = validate_entry(
684 "http://127.0.0.1/secret",
685 None,
686 &opts.seed,
687 &RobotsPolicy::Unavailable,
688 &opts,
689 &mut visited,
690 );
691 assert!(result.is_none());
692 }
693
694 #[test]
695 fn validate_entry_rejects_cross_site() {
696 let opts = test_config("https://example.com");
697 let robots = RobotsPolicy::Unavailable;
698 let mut visited = HashSet::new();
699 let result = validate_entry("https://evil.com/page", None, &opts.seed, &robots, &opts, &mut visited);
700 assert!(result.is_none());
701 }
702
703 #[test]
704 fn validate_entry_deduplicates() {
705 let opts = test_config("https://example.com");
706 let robots = RobotsPolicy::Unavailable;
707 let mut visited = HashSet::new();
708 let first = validate_entry(
709 "https://example.com/page",
710 None,
711 &opts.seed,
712 &robots,
713 &opts,
714 &mut visited,
715 );
716 assert!(first.is_some());
717 let second = validate_entry(
718 "https://example.com/page",
719 None,
720 &opts.seed,
721 &robots,
722 &opts,
723 &mut visited,
724 );
725 assert!(second.is_none());
726 }
727
728 #[test]
729 fn validate_entry_rejects_long_url() {
730 let opts = test_config("https://example.com");
731 let robots = RobotsPolicy::Unavailable;
732 let mut visited = HashSet::new();
733 let long_url = format!("https://example.com/{}", "a".repeat(MAP_URL_MAX_LEN));
734 let result = validate_entry(&long_url, None, &opts.seed, &robots, &opts, &mut visited);
735 assert!(result.is_none());
736 }
737
738 #[test]
739 fn discover_sitemaps_includes_robots_and_default() {
740 let seed = Url::parse("https://example.com").unwrap();
741 let robots = RobotsPolicy::Rules(RobotsRules {
742 rules: Vec::new(),
743 sitemaps: vec![Url::parse("https://example.com/custom-sitemap.xml").unwrap()],
744 });
745 let sitemaps = discover_sitemaps(&robots, &seed);
746 assert_eq!(sitemaps.len(), 2);
747 assert_eq!(sitemaps[0].as_str(), "https://example.com/custom-sitemap.xml");
748 assert_eq!(sitemaps[1].as_str(), "https://example.com/sitemap.xml");
749 }
750
751 #[test]
752 fn discover_sitemaps_deduplicates_default() {
753 let seed = Url::parse("https://example.com").unwrap();
754 let robots = RobotsPolicy::Rules(RobotsRules {
755 rules: Vec::new(),
756 sitemaps: vec![Url::parse("https://example.com/sitemap.xml").unwrap()],
757 });
758 let sitemaps = discover_sitemaps(&robots, &seed);
759 assert_eq!(sitemaps.len(), 1);
760 }
761
    // End-to-end tests that run the fetch helpers and `run` against a local
    // wiremock server.
    mod integration {
        use super::super::*;
        use wiremock::matchers::{method, path};
        use wiremock::{Mock, MockServer, ResponseTemplate};

        /// A 200 XML response at `/sitemap.xml` is returned as text that parses
        /// to a single entry.
        #[tokio::test]
        async fn fetch_sitemap_parses_urlset() {
            let server = MockServer::start().await;
            let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/a</loc></url></urlset>"#;
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(200).set_body_raw(xml.as_bytes().to_vec(), "application/xml"))
                .mount(&server)
                .await;

            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
            // The fetch is blocking, so it runs off the async test thread.
            let body = tokio::task::spawn_blocking(move || fetch_sitemap(&agent, &url, &url))
                .await
                .unwrap();

            let entries = parse_sitemap(&body.unwrap());
            assert_eq!(entries.len(), 1);
        }

        /// A 200 response whose body is an HTML page is rejected (`None`).
        #[tokio::test]
        async fn fetch_sitemap_rejects_html_error_page() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(200).set_body_raw(
                    b"<!DOCTYPE html><html><body>Not Found</body></html>".to_vec(),
                    "text/html; charset=utf-8",
                ))
                .mount(&server)
                .await;

            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
            let body = tokio::task::spawn_blocking(move || fetch_sitemap(&agent, &url, &url))
                .await
                .unwrap();

            assert!(body.is_none());
        }

        /// A 404 response yields `None`.
        #[tokio::test]
        async fn fetch_sitemap_returns_none_on_404() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(404))
                .mount(&server)
                .await;

            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
            let body = tokio::task::spawn_blocking(move || fetch_sitemap(&agent, &url, &url))
                .await
                .unwrap();

            assert!(body.is_none());
        }

        /// A gzip-compressed sitemap served at a `.gz` path is decompressed
        /// and parses to a single entry.
        #[tokio::test]
        async fn fetch_sitemap_handles_gzip() {
            use flate2::Compression;
            use flate2::write::GzEncoder;
            use std::io::Write;

            let server = MockServer::start().await;
            let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/gz</loc></url></urlset>"#;
            let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
            encoder.write_all(xml.as_bytes()).unwrap();
            let compressed = encoder.finish().unwrap();

            Mock::given(method("GET"))
                .and(path("/sitemap.xml.gz"))
                .respond_with(ResponseTemplate::new(200).set_body_raw(compressed, "application/gzip"))
                .mount(&server)
                .await;

            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let url = Url::parse(&format!("{}/sitemap.xml.gz", server.uri())).unwrap();
            let body = tokio::task::spawn_blocking(move || fetch_sitemap(&agent, &url, &url))
                .await
                .unwrap();

            let entries = parse_sitemap(&body.unwrap());
            assert_eq!(entries.len(), 1);
        }

        /// The seed page is fetched and its single anchor is extracted.
        #[tokio::test]
        async fn fetch_html_extracts_links() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/"))
                .respond_with(ResponseTemplate::new(200).set_body_raw(
                    br#"<html><body><a href="/link">x</a></body></html>"#.to_vec(),
                    "text/html; charset=utf-8",
                ))
                .mount(&server)
                .await;

            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let seed = Url::parse(&server.uri()).unwrap();
            let html = tokio::task::spawn_blocking({
                let seed = seed.clone();
                move || fetch_html(&agent, &seed)
            })
            .await
            .unwrap()
            .unwrap();

            let links = extract_links(&html, &seed);
            assert_eq!(links.len(), 1);
        }

        /// Runs `run` against `server` with a default test config (limit 100,
        /// no filters, `test-bot` user agent, 5 second timeout, fallback
        /// enabled) after letting `configure` adjust it, and returns copies of
        /// every reported entry.
        async fn check_run(server: &MockServer, configure: impl FnOnce(&mut MapConfig)) -> Vec<MapEntry> {
            let mut config = MapConfig {
                seed: Url::parse(&server.uri()).unwrap(),
                limit: 100,
                include: None,
                exclude: None,
                user_agent: Some("test-bot".into()),
                timeout: Duration::from_secs(5),
                no_fallback: false,
            };
            configure(&mut config);
            let mut entries = Vec::new();
            run(&config, |e| {
                entries.push(MapEntry {
                    url: e.url.clone(),
                    lastmod: e.lastmod.clone(),
                });
            })
            .await;
            entries
        }

        /// With a permissive robots.txt and a two-URL sitemap, both URLs are
        /// reported.
        #[tokio::test]
        async fn run_discovers_urls_from_sitemap() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/robots.txt"))
                .respond_with(ResponseTemplate::new(200).set_body_string("User-agent: *\nAllow: /"))
                .mount(&server)
                .await;
            let sitemap = format!(
                "<urlset><url><loc>{}/page1</loc></url><url><loc>{}/page2</loc></url></urlset>",
                server.uri(),
                server.uri()
            );
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
                .mount(&server)
                .await;

            let entries = check_run(&server, |_| {}).await;
            assert_eq!(entries.len(), 2);
            assert!(entries.iter().any(|e| e.url.ends_with("/page1")));
            assert!(entries.iter().any(|e| e.url.ends_with("/page2")));
        }

        /// With a limit of 2 and a three-URL sitemap, only two URLs are reported.
        #[tokio::test]
        async fn run_respects_limit() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/robots.txt"))
                .respond_with(ResponseTemplate::new(404))
                .mount(&server)
                .await;
            let sitemap = format!(
                "<urlset><url><loc>{}/a</loc></url><url><loc>{}/b</loc></url><url><loc>{}/c</loc></url></urlset>",
                server.uri(),
                server.uri(),
                server.uri()
            );
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
                .mount(&server)
                .await;

            let entries = check_run(&server, |c| c.limit = 2).await;
            assert_eq!(entries.len(), 2);
        }

        /// A sitemap index at `/sitemap.xml` pointing to `/sub.xml` leads to
        /// the URL listed in the nested sitemap.
        #[tokio::test]
        async fn run_follows_sitemap_index() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/robots.txt"))
                .respond_with(ResponseTemplate::new(404))
                .mount(&server)
                .await;
            let index = format!(
                "<sitemapindex><sitemap><loc>{}/sub.xml</loc></sitemap></sitemapindex>",
                server.uri()
            );
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(200).set_body_string(index))
                .mount(&server)
                .await;
            let sub = format!("<urlset><url><loc>{}/deep</loc></url></urlset>", server.uri());
            Mock::given(method("GET"))
                .and(path("/sub.xml"))
                .respond_with(ResponseTemplate::new(200).set_body_string(sub))
                .mount(&server)
                .await;

            let entries = check_run(&server, |_| {}).await;
            assert_eq!(entries.len(), 1);
            assert!(entries[0].url.ends_with("/deep"));
        }

        /// When robots.txt and the sitemap both return 404, the two links on
        /// the seed page are reported instead.
        #[tokio::test]
        async fn run_falls_back_to_html_links() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/robots.txt"))
                .respond_with(ResponseTemplate::new(404))
                .mount(&server)
                .await;
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(404))
                .mount(&server)
                .await;
            let html = format!(
                r#"<html><body><a href="{}/link1">L1</a><a href="{}/link2">L2</a></body></html>"#,
                server.uri(),
                server.uri()
            );
            Mock::given(method("GET"))
                .and(path("/"))
                .respond_with(ResponseTemplate::new(200).set_body_string(html))
                .mount(&server)
                .await;

            let entries = check_run(&server, |_| {}).await;
            assert_eq!(entries.len(), 2);
        }

        /// With `no_fallback` set, nothing is reported even though the seed
        /// page contains a link.
        #[tokio::test]
        async fn run_no_fallback_skips_html() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/robots.txt"))
                .respond_with(ResponseTemplate::new(404))
                .mount(&server)
                .await;
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(404))
                .mount(&server)
                .await;
            Mock::given(method("GET"))
                .and(path("/"))
                .respond_with(
                    ResponseTemplate::new(200).set_body_string(r#"<html><body><a href="/link">L</a></body></html>"#),
                )
                .mount(&server)
                .await;

            let entries = check_run(&server, |c| c.no_fallback = true).await;
            assert_eq!(entries.len(), 0);
        }

        /// A URL listed twice in the sitemap is reported only once.
        #[tokio::test]
        async fn run_deduplicates_urls() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/robots.txt"))
                .respond_with(ResponseTemplate::new(404))
                .mount(&server)
                .await;
            let sitemap = format!(
                "<urlset><url><loc>{}/dup</loc></url><url><loc>{}/dup</loc></url><url><loc>{}/unique</loc></url></urlset>",
                server.uri(),
                server.uri(),
                server.uri()
            );
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
                .mount(&server)
                .await;

            let entries = check_run(&server, |_| {}).await;
            assert_eq!(entries.len(), 2);
        }
    }
1057}