1use std::collections::{HashSet, VecDeque};
4use std::io::Read as _;
5use std::time::{Duration, Instant};
6
7use tokio::task::spawn_blocking;
8use url::Url;
9
10use crate::robots::{RobotsPolicy, RobotsRules};
11use crate::scope::{is_same_site, matches_scope, normalize_url};
12use crate::{bridge, net};
13
/// Byte limit applied when reading a sitemap response body (the compressed size for gzip).
const MAP_SITEMAP_MAX_BYTES: u64 = 50 * 1024 * 1024;
/// Maximum number of bytes taken from the gzip decoder for a compressed sitemap.
const MAP_SITEMAP_MAX_DECOMPRESSED: u64 = 10 * 1024 * 1024;
/// Largest accepted ratio of decompressed size to compressed size for gzip sitemaps.
const MAP_GZIP_MAX_RATIO: u64 = 100;
/// Byte limit applied when reading the seed page for the HTML link fallback.
const MAP_HTML_MAX_BYTES: u64 = 2 * 1024 * 1024;
/// Bounds the manual redirect-following loop.
const MAP_MAX_REDIRECTS: u8 = 5;
/// Maximum number of sitemap documents fetched in a single run.
const MAP_MAX_SITEMAPS: usize = 200;
/// Deepest sitemap-index nesting level that is still fetched.
const MAP_MAX_INDEX_DEPTH: u8 = 5;
/// Minimum spacing enforced between consecutive throttled fetches.
const MAP_MIN_FETCH_INTERVAL: Duration = Duration::from_millis(500);
/// Longest candidate URL string, in bytes, that is accepted.
const MAP_URL_MAX_LEN: usize = 2048;
/// Number of leading body bytes inspected when sniffing for HTML.
const HTML_SNIFF_LEN: usize = 100;
24
/// Builder-style options for [`map`].
///
/// Create with [`MapOptions::new`], chain the setters, then pass the value to `map()`.
#[must_use = "options do nothing until passed to map()"]
#[derive(Debug, Clone)]
pub struct MapOptions {
    url: String,
    limit: usize,
    include: Vec<String>,
    exclude: Vec<String>,
    user_agent: Option<String>,
    timeout: u64,
    no_fallback: bool,
}

impl MapOptions {
    /// Starts a new option set for `url`.
    ///
    /// Defaults: limit of 5000, no include/exclude patterns, no user-agent override,
    /// a timeout of 30 seconds, and the HTML fallback enabled.
    pub fn new(url: impl Into<String>) -> Self {
        let url = url.into();
        Self {
            url,
            limit: 5000,
            include: vec![],
            exclude: vec![],
            user_agent: None,
            timeout: 30,
            no_fallback: false,
        }
    }

    /// Sets the maximum number of URLs to return.
    pub fn limit(self, n: usize) -> Self {
        Self { limit: n, ..self }
    }

    /// Replaces the include patterns with `patterns`.
    pub fn include(self, patterns: &[&str]) -> Self {
        let include = patterns.iter().copied().map(String::from).collect();
        Self { include, ..self }
    }

    /// Replaces the exclude patterns with `patterns`.
    pub fn exclude(self, patterns: &[&str]) -> Self {
        let exclude = patterns.iter().copied().map(String::from).collect();
        Self { exclude, ..self }
    }

    /// Overrides the user agent.
    pub fn user_agent(self, ua: impl Into<String>) -> Self {
        Self {
            user_agent: Some(ua.into()),
            ..self
        }
    }

    /// Sets the timeout, in seconds.
    pub fn timeout(self, secs: u64) -> Self {
        Self { timeout: secs, ..self }
    }

    /// Disables (`true`) or enables (`false`) the HTML link fallback.
    pub fn no_fallback(self, yes: bool) -> Self {
        Self {
            no_fallback: yes,
            ..self
        }
    }
}
88
/// One discovered URL, as returned by [`map`].
#[derive(Debug, Clone, serde::Serialize)]
pub struct MappedUrl {
    /// The URL string, copied from the internal entry produced during discovery.
    pub url: String,
    /// The `lastmod` text when one was captured; left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
98
99#[allow(clippy::needless_pass_by_value)]
101pub fn map(opts: MapOptions) -> crate::error::Result<Vec<MappedUrl>> {
102 net::ensure_crypto_provider();
103 let seed = net::validate_url(&opts.url)?;
104
105 let include = if opts.include.is_empty() {
106 None
107 } else {
108 Some(crate::scope::build_globset(&opts.include)?)
109 };
110 let exclude = if opts.exclude.is_empty() {
111 None
112 } else {
113 Some(crate::scope::build_globset(&opts.exclude)?)
114 };
115
116 let internal = MapConfig {
117 seed,
118 limit: opts.limit,
119 include,
120 exclude,
121 user_agent: opts.user_agent,
122 timeout: Duration::from_secs(opts.timeout),
123 no_fallback: opts.no_fallback,
124 };
125
126 let mut results = Vec::new();
127 crate::runtime::block_on(run(&internal, |entry| {
128 results.push(MappedUrl {
129 url: entry.url.clone(),
130 lastmod: entry.lastmod.clone(),
131 });
132 }))
133 .map_err(|e| crate::error::Error::engine(e, None))?;
134 Ok(results)
135}
136
/// Resolved, crate-internal configuration consumed by [`run`].
pub(crate) struct MapConfig {
    /// Starting URL; also the reference point for same-site checks.
    pub seed: Url,
    /// Maximum number of URLs reported.
    pub limit: usize,
    /// Compiled include globs, if any, passed to the scope check.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude globs, if any, passed to the scope check.
    pub exclude: Option<globset::GlobSet>,
    /// User-agent override; `run` uses `bridge::default_user_agent()` when `None`.
    pub user_agent: Option<String>,
    /// Timeout handed to the HTTP agent and to the robots fetch.
    pub timeout: Duration,
    /// When true, the HTML link fallback is skipped.
    pub no_fallback: bool,
}
147
/// A URL accepted during discovery, handed to the `on_url` callback of [`run`].
#[derive(serde::Serialize)]
pub(crate) struct MapEntry {
    /// The normalized URL string.
    pub url: String,
    /// The `lastmod` text when one was captured; left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
155
156pub(crate) async fn run(opts: &MapConfig, mut on_url: impl FnMut(&MapEntry)) {
158 let ua = opts
159 .user_agent
160 .as_deref()
161 .unwrap_or_else(|| bridge::default_user_agent());
162 let agent = build_agent(ua, opts.timeout);
163
164 let robots = {
165 let seed = opts.seed.clone();
166 let user_agent = opts.user_agent.clone();
167 let timeout = opts.timeout;
168 spawn_blocking(move || RobotsRules::fetch(&seed, user_agent.as_deref(), timeout))
169 .await
170 .unwrap_or(RobotsPolicy::Unreachable)
171 };
172
173 let mut visited = HashSet::new();
174 let mut count = 0;
175 let mut last_fetch = Instant::now()
176 .checked_sub(MAP_MIN_FETCH_INTERVAL)
177 .unwrap_or_else(Instant::now);
178 let mut sitemap_queue: VecDeque<(Url, u8)> = discover_sitemaps(&robots, &opts.seed)
179 .into_iter()
180 .map(|u| (u, 0))
181 .collect();
182 let mut sitemaps_fetched = 0;
183
184 while let Some((sitemap_url, depth)) = sitemap_queue.pop_front() {
185 if sitemaps_fetched >= MAP_MAX_SITEMAPS || count >= opts.limit {
186 break;
187 }
188 if depth > MAP_MAX_INDEX_DEPTH || !is_same_site(&opts.seed, &sitemap_url) {
189 continue;
190 }
191
192 throttle(&mut last_fetch).await;
193 sitemaps_fetched += 1;
194
195 let body = {
196 let agent = agent.clone();
197 spawn_blocking({
198 let seed = opts.seed.clone();
199 move || fetch_sitemap(&agent, &sitemap_url, &seed)
200 })
201 .await
202 .ok()
203 .flatten()
204 };
205 let Some(body) = body else { continue };
206
207 for entry in parse_sitemap(&body) {
208 match entry {
209 SitemapEntry::Url { loc, lastmod } => {
210 if count >= opts.limit {
211 break;
212 }
213 if let Some(e) = validate_entry(&loc, lastmod, &opts.seed, &robots, opts, &mut visited) {
214 on_url(&e);
215 count += 1;
216 }
217 }
218 SitemapEntry::Sitemap { loc } => {
219 if let Ok(url) = Url::parse(&loc) {
220 sitemap_queue.push_back((url, depth + 1));
221 }
222 }
223 }
224 }
225 }
226
227 if count == 0 && !opts.no_fallback {
228 throttle(&mut last_fetch).await;
229 let html = {
230 let agent = agent.clone();
231 let seed = opts.seed.clone();
232 spawn_blocking(move || fetch_html(&agent, &seed)).await.ok().flatten()
233 };
234 if let Some(html) = html {
235 for link in extract_links(&html, &opts.seed) {
236 if count >= opts.limit {
237 break;
238 }
239 if let Some(e) = validate_entry(link.as_str(), None, &opts.seed, &robots, opts, &mut visited) {
240 on_url(&e);
241 count += 1;
242 }
243 }
244 }
245 }
246}
247
248fn discover_sitemaps(robots: &RobotsPolicy, seed: &Url) -> Vec<Url> {
249 let mut urls = Vec::new();
250 if let RobotsPolicy::Rules(rules) = robots {
251 urls.extend(rules.sitemaps.iter().cloned());
252 }
253 if let Ok(default) = seed.join("/sitemap.xml") {
254 if !urls.contains(&default) {
255 urls.push(default);
256 }
257 }
258 urls
259}
260
261fn build_agent(ua: &str, timeout: Duration) -> ureq::Agent {
262 ureq::Agent::new_with_config(
263 ureq::config::Config::builder()
264 .max_redirects(0)
265 .http_status_as_error(false)
266 .timeout_global(Some(timeout))
267 .user_agent(ua)
268 .build(),
269 )
270}
271
272fn fetch_following_redirects(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<ureq::http::Response<ureq::Body>> {
273 let mut current = url.clone();
274 for _ in 0..MAP_MAX_REDIRECTS {
275 let resp = agent.get(current.as_str()).call().ok()?;
276 let status = resp.status().as_u16();
277 if matches!(status, 301 | 302 | 303 | 307 | 308) {
278 let location = resp.headers().get("location")?.to_str().ok()?;
279 let next = current.join(location).ok()?;
280 if net::validate_url_with_policy(next.as_str(), bridge::engine_policy()).is_err()
281 || !is_same_site(seed, &next)
282 {
283 return None;
284 }
285 current = next;
286 continue;
287 }
288 if status >= 400 {
289 return None;
290 }
291 return Some(resp);
292 }
293 None
294}
295
296fn fetch_sitemap(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<String> {
297 let resp = fetch_following_redirects(agent, url, seed)?;
298 let content_type = resp
299 .headers()
300 .get("content-type")
301 .and_then(|v| v.to_str().ok())
302 .unwrap_or("");
303
304 let is_gzip = url
305 .path()
306 .rsplit('/')
307 .next()
308 .and_then(|seg| std::path::Path::new(seg).extension())
309 .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
310 || content_type.contains("gzip")
311 || resp
312 .headers()
313 .get("content-encoding")
314 .and_then(|v| v.to_str().ok())
315 .is_some_and(|v| v.contains("gzip"));
316
317 if is_gzip {
318 let bytes = resp
319 .into_body()
320 .with_config()
321 .limit(MAP_SITEMAP_MAX_BYTES)
322 .read_to_vec()
323 .ok()?;
324 let mut decoded = Vec::new();
325 flate2::read::GzDecoder::new(bytes.as_slice())
326 .take(MAP_SITEMAP_MAX_DECOMPRESSED)
327 .read_to_end(&mut decoded)
328 .ok()?;
329 if decoded.len() as u64 > bytes.len() as u64 * MAP_GZIP_MAX_RATIO {
330 return None;
331 }
332 if looks_like_html(&decoded) {
333 return None;
334 }
335 String::from_utf8(decoded).ok()
336 } else {
337 let body = resp
338 .into_body()
339 .with_config()
340 .limit(MAP_SITEMAP_MAX_BYTES)
341 .read_to_string()
342 .ok()?;
343 if looks_like_html(body.as_bytes()) {
344 return None;
345 }
346 Some(body)
347 }
348}
349
350fn looks_like_html(bytes: &[u8]) -> bool {
351 const DOCTYPE: &[u8] = b"<!doctype";
352 const HTML: &[u8] = b"<html";
353 const BOM: &[u8] = b"\xef\xbb\xbf";
354 let mut prefix = bytes.get(..HTML_SNIFF_LEN).unwrap_or(bytes);
355 if prefix.starts_with(BOM) {
356 prefix = &prefix[BOM.len()..];
357 }
358 let prefix = prefix
359 .iter()
360 .position(|b| !b.is_ascii_whitespace())
361 .map_or(&[][..], |i| &prefix[i..]);
362 prefix
363 .get(..DOCTYPE.len())
364 .is_some_and(|p| p.eq_ignore_ascii_case(DOCTYPE))
365 || prefix.get(..HTML.len()).is_some_and(|p| p.eq_ignore_ascii_case(HTML))
366}
367
368fn fetch_html(agent: &ureq::Agent, url: &Url) -> Option<String> {
369 let resp = fetch_following_redirects(agent, url, url)?;
370 resp.into_body()
371 .with_config()
372 .limit(MAP_HTML_MAX_BYTES)
373 .read_to_string()
374 .ok()
375}
376
377fn extract_links(html: &str, base: &Url) -> Vec<Url> {
378 dom_query::Document::from(html)
379 .select("a[href]")
380 .iter()
381 .filter_map(|el| {
382 let href = el.attr("href")?;
383 let href = href.trim();
384 if href.is_empty() {
385 return None;
386 }
387 let resolved = base.join(href).ok()?;
388 matches!(resolved.scheme(), "http" | "https").then_some(resolved)
389 })
390 .collect()
391}
392
/// One record produced by [`parse_sitemap`].
enum SitemapEntry {
    /// A `<url>` element: its `<loc>` text and, when present, its `<lastmod>` text.
    Url { loc: String, lastmod: Option<String> },
    /// A `<sitemap>` element: the `<loc>` text of a nested sitemap.
    Sitemap { loc: String },
}
397
/// Parses sitemap XML into `<url>` and `<sitemap>` records.
///
/// Element names are matched by local name, so namespace prefixes are ignored.
/// Only `<loc>`/`<lastmod>` that are direct children of the current `<url>` or
/// `<sitemap>` are captured; the same names nested inside other child elements
/// are skipped via the `depth` counter. Records with an empty `loc` are dropped.
/// Parsing stops at end of input or at the first reader error, returning whatever
/// was collected up to that point.
fn parse_sitemap(body: &str) -> Vec<SitemapEntry> {
    use quick_xml::events::Event;
    use quick_xml::reader::Reader;

    let mut reader = Reader::from_str(body);
    let mut entries = Vec::new();
    let mut buf = Vec::new();
    // Which text buffer, if any, incoming text should be appended to.
    let mut capture = Capture::Idle;
    let mut loc = String::new();
    let mut lastmod = String::new();
    let mut in_url = false;
    let mut in_sitemap = false;
    // Nesting level of non-captured elements inside the current <url>/<sitemap>.
    let mut depth: u32 = 0;

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"url" => {
                        in_url = true;
                        depth = 0;
                    }
                    b"sitemap" => {
                        in_sitemap = true;
                        depth = 0;
                    }
                    // Only direct children (depth 0) are captured.
                    b"loc" if (in_url || in_sitemap) && depth == 0 => capture = Capture::Loc,
                    b"lastmod" if in_url && depth == 0 => capture = Capture::Lastmod,
                    // Any other element inside a record (including nested loc/lastmod)
                    // deepens the nesting.
                    _ if in_url || in_sitemap => depth += 1,
                    _ => {}
                }
            }
            Ok(Event::Text(e)) => {
                if let Ok(text) = e.xml10_content() {
                    // Text may arrive in several pieces (e.g. around entity
                    // references); each piece is trimmed and appended.
                    match capture {
                        Capture::Loc => loc.push_str(text.trim()),
                        Capture::Lastmod => lastmod.push_str(text.trim()),
                        Capture::Idle => {}
                    }
                } else {
                    // Unreadable text: discard what was gathered and stop capturing.
                    loc.clear();
                    lastmod.clear();
                    capture = Capture::Idle;
                }
            }
            Ok(Event::GeneralRef(e)) => {
                // Resolve the five predefined XML entities; any other reference
                // contributes nothing.
                let resolved = match &*e {
                    b"amp" => "&",
                    b"lt" => "<",
                    b"gt" => ">",
                    b"quot" => "\"",
                    b"apos" => "'",
                    _ => "",
                };
                match capture {
                    Capture::Loc => loc.push_str(resolved),
                    Capture::Lastmod => lastmod.push_str(resolved),
                    Capture::Idle => {}
                }
            }
            Ok(Event::End(e)) => {
                let name = e.local_name();
                match name.as_ref() {
                    b"url" if in_url => {
                        if !loc.is_empty() {
                            let lm = if lastmod.is_empty() {
                                None
                            } else {
                                Some(std::mem::take(&mut lastmod))
                            };
                            entries.push(SitemapEntry::Url {
                                loc: std::mem::take(&mut loc),
                                lastmod: lm,
                            });
                        }
                        // Reset state for the next record.
                        loc.clear();
                        lastmod.clear();
                        in_url = false;
                    }
                    b"sitemap" if in_sitemap => {
                        if !loc.is_empty() {
                            entries.push(SitemapEntry::Sitemap {
                                loc: std::mem::take(&mut loc),
                            });
                        }
                        loc.clear();
                        lastmod.clear();
                        in_sitemap = false;
                    }
                    // Closing a captured element ends the capture; a nested loc/lastmod
                    // (capture is Idle) falls through to the depth decrement below.
                    b"loc" | b"lastmod" if capture != Capture::Idle => capture = Capture::Idle,
                    _ if depth > 0 => depth -= 1,
                    _ => {}
                }
            }
            Ok(Event::Eof) | Err(_) => break,
            _ => {}
        }
        buf.clear();
    }

    entries
}
501
/// Text-capture state of the sitemap parser.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Capture {
    /// Incoming text is ignored.
    Idle,
    /// Incoming text is appended to the current `loc`.
    Loc,
    /// Incoming text is appended to the current `lastmod`.
    Lastmod,
}
508
509fn validate_entry(
510 loc: &str,
511 lastmod: Option<String>,
512 seed: &Url,
513 robots: &RobotsPolicy,
514 opts: &MapConfig,
515 visited: &mut HashSet<String>,
516) -> Option<MapEntry> {
517 if loc.len() > MAP_URL_MAX_LEN {
518 return None;
519 }
520 let url = Url::parse(loc)
521 .ok()
522 .filter(|u| matches!(u.scheme(), "http" | "https"))?;
523 if !is_same_site(seed, &url) {
524 return None;
525 }
526 if !robots.is_allowed(&url) {
527 return None;
528 }
529 if !matches_scope(&url, opts.include.as_ref(), opts.exclude.as_ref()) {
530 return None;
531 }
532 let normalized = normalize_url(&url);
533 if !visited.insert(normalized.clone()) {
534 return None;
535 }
536 Some(MapEntry {
537 url: normalized,
538 lastmod,
539 })
540}
541
542async fn throttle(last_fetch: &mut Instant) {
543 let elapsed = last_fetch.elapsed();
544 if elapsed < MAP_MIN_FETCH_INTERVAL {
545 tokio::time::sleep(MAP_MIN_FETCH_INTERVAL.saturating_sub(elapsed)).await;
546 }
547 *last_fetch = Instant::now();
548}
549
550#[cfg(test)]
551mod tests {
552 use super::*;
553
554 fn test_config(seed: &str) -> MapConfig {
555 MapConfig {
556 seed: Url::parse(seed).unwrap(),
557 limit: 100,
558 include: None,
559 exclude: None,
560 user_agent: None,
561 timeout: Duration::from_secs(30),
562 no_fallback: false,
563 }
564 }
565
566 #[test]
567 fn parse_urlset() {
568 let xml = r#"<?xml version="1.0"?>
569<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
570 <url><loc>https://example.com/a</loc><lastmod>2026-01-01</lastmod></url>
571 <url><loc>https://example.com/b</loc></url>
572</urlset>"#;
573 let entries = parse_sitemap(xml);
574 assert_eq!(entries.len(), 2);
575 match &entries[0] {
576 SitemapEntry::Url { loc, lastmod } => {
577 assert_eq!(loc, "https://example.com/a");
578 assert_eq!(lastmod.as_deref(), Some("2026-01-01"));
579 }
580 SitemapEntry::Sitemap { .. } => panic!("expected Url"),
581 }
582 match &entries[1] {
583 SitemapEntry::Url { loc, lastmod } => {
584 assert_eq!(loc, "https://example.com/b");
585 assert!(lastmod.is_none());
586 }
587 SitemapEntry::Sitemap { .. } => panic!("expected Url"),
588 }
589 }
590
591 #[test]
592 fn parse_sitemapindex() {
593 let xml = r#"<?xml version="1.0"?>
594<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
595 <sitemap><loc>https://example.com/sitemap1.xml</loc></sitemap>
596 <sitemap><loc>https://example.com/sitemap2.xml</loc></sitemap>
597</sitemapindex>"#;
598 let entries = parse_sitemap(xml);
599 assert_eq!(entries.len(), 2);
600 match &entries[0] {
601 SitemapEntry::Sitemap { loc } => assert_eq!(loc, "https://example.com/sitemap1.xml"),
602 SitemapEntry::Url { .. } => panic!("expected Sitemap"),
603 }
604 }
605
606 #[test]
607 fn parse_handles_xml_entities() {
608 let xml = r"<urlset><url><loc>https://example.com/a?b=1&c=2</loc></url></urlset>";
609 let entries = parse_sitemap(xml);
610 assert_eq!(entries.len(), 1);
611 match &entries[0] {
612 SitemapEntry::Url { loc, .. } => assert_eq!(loc, "https://example.com/a?b=1&c=2"),
613 SitemapEntry::Sitemap { .. } => panic!("expected Url"),
614 }
615 }
616
617 #[test]
618 fn parse_handles_namespaced_tags() {
619 let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
620 xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
621 <url>
622 <loc>https://example.com/page</loc>
623 <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
624 </url>
625</urlset>"#;
626 let entries = parse_sitemap(xml);
627 assert_eq!(entries.len(), 1);
628 match &entries[0] {
629 SitemapEntry::Url { loc, .. } => assert_eq!(loc, "https://example.com/page"),
630 SitemapEntry::Sitemap { .. } => panic!("expected Url"),
631 }
632 }
633
634 #[test]
635 fn parse_loc_after_nested_extension() {
636 let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
637 xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
638 <url>
639 <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
640 <loc>https://example.com/page</loc>
641 <lastmod>2026-01-01</lastmod>
642 </url>
643</urlset>"#;
644 let entries = parse_sitemap(xml);
645 assert_eq!(entries.len(), 1);
646 match &entries[0] {
647 SitemapEntry::Url { loc, lastmod } => {
648 assert_eq!(loc, "https://example.com/page");
649 assert_eq!(lastmod.as_deref(), Some("2026-01-01"));
650 }
651 SitemapEntry::Sitemap { .. } => panic!("expected Url"),
652 }
653 }
654
655 #[test]
656 fn parse_empty_body_returns_empty() {
657 assert!(parse_sitemap("").is_empty());
658 assert!(parse_sitemap("<html><body>Not Found</body></html>").is_empty());
659 }
660
661 #[test]
662 fn looks_like_html_detects_variants() {
663 assert!(looks_like_html(b"<!DOCTYPE html>"));
664 assert!(looks_like_html(b"<!doctype html>"));
665 assert!(looks_like_html(b"<html lang=\"en\">"));
666 assert!(looks_like_html(b"<HTML>"));
667 assert!(looks_like_html(b"\xef\xbb\xbf<!DOCTYPE html>"));
668 assert!(looks_like_html(b" \n<!doctype html>"));
669 assert!(looks_like_html(b"\xef\xbb\xbf <html>"));
670 assert!(!looks_like_html(b"<?xml version=\"1.0\"?>"));
671 assert!(!looks_like_html(b"<urlset>"));
672 assert!(!looks_like_html(b""));
673 }
674
675 #[test]
676 fn validate_entry_rejects_private_ip() {
677 let opts = test_config("https://example.com");
678
679 let mut visited = HashSet::new();
680 let result = validate_entry(
681 "http://127.0.0.1/secret",
682 None,
683 &opts.seed,
684 &RobotsPolicy::Unavailable,
685 &opts,
686 &mut visited,
687 );
688 assert!(result.is_none());
689 }
690
691 #[test]
692 fn validate_entry_rejects_cross_site() {
693 let opts = test_config("https://example.com");
694 let robots = RobotsPolicy::Unavailable;
695 let mut visited = HashSet::new();
696 let result = validate_entry("https://evil.com/page", None, &opts.seed, &robots, &opts, &mut visited);
697 assert!(result.is_none());
698 }
699
700 #[test]
701 fn validate_entry_deduplicates() {
702 let opts = test_config("https://example.com");
703 let robots = RobotsPolicy::Unavailable;
704 let mut visited = HashSet::new();
705 let first = validate_entry(
706 "https://example.com/page",
707 None,
708 &opts.seed,
709 &robots,
710 &opts,
711 &mut visited,
712 );
713 assert!(first.is_some());
714 let second = validate_entry(
715 "https://example.com/page",
716 None,
717 &opts.seed,
718 &robots,
719 &opts,
720 &mut visited,
721 );
722 assert!(second.is_none());
723 }
724
725 #[test]
726 fn validate_entry_rejects_long_url() {
727 let opts = test_config("https://example.com");
728 let robots = RobotsPolicy::Unavailable;
729 let mut visited = HashSet::new();
730 let long_url = format!("https://example.com/{}", "a".repeat(MAP_URL_MAX_LEN));
731 let result = validate_entry(&long_url, None, &opts.seed, &robots, &opts, &mut visited);
732 assert!(result.is_none());
733 }
734
735 #[test]
736 fn discover_sitemaps_includes_robots_and_default() {
737 let seed = Url::parse("https://example.com").unwrap();
738 let robots = RobotsPolicy::Rules(RobotsRules {
739 rules: Vec::new(),
740 sitemaps: vec![Url::parse("https://example.com/custom-sitemap.xml").unwrap()],
741 });
742 let sitemaps = discover_sitemaps(&robots, &seed);
743 assert_eq!(sitemaps.len(), 2);
744 assert_eq!(sitemaps[0].as_str(), "https://example.com/custom-sitemap.xml");
745 assert_eq!(sitemaps[1].as_str(), "https://example.com/sitemap.xml");
746 }
747
748 #[test]
749 fn discover_sitemaps_deduplicates_default() {
750 let seed = Url::parse("https://example.com").unwrap();
751 let robots = RobotsPolicy::Rules(RobotsRules {
752 rules: Vec::new(),
753 sitemaps: vec![Url::parse("https://example.com/sitemap.xml").unwrap()],
754 });
755 let sitemaps = discover_sitemaps(&robots, &seed);
756 assert_eq!(sitemaps.len(), 1);
757 }
758
759 mod integration {
760 use std::time::Duration;
761
762 use tokio::task::spawn_blocking;
763 use url::Url;
764 use wiremock::matchers::{method, path};
765 use wiremock::{Mock, MockServer, ResponseTemplate};
766
767 use crate::map::{
768 MapConfig, MapEntry, build_agent, extract_links, fetch_html, fetch_sitemap, parse_sitemap, run,
769 };
770
771 #[tokio::test]
772 async fn fetch_sitemap_parses_urlset() {
773 let server = MockServer::start().await;
774 let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/a</loc></url></urlset>"#;
775 Mock::given(method("GET"))
776 .and(path("/sitemap.xml"))
777 .respond_with(ResponseTemplate::new(200).set_body_raw(xml.as_bytes().to_vec(), "application/xml"))
778 .mount(&server)
779 .await;
780
781 let agent = build_agent("test/1.0", Duration::from_secs(5));
782 let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
783 let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();
784
785 let entries = parse_sitemap(&body.unwrap());
786 assert_eq!(entries.len(), 1);
787 }
788
789 #[tokio::test]
790 async fn fetch_sitemap_rejects_html_error_page() {
791 let server = MockServer::start().await;
792 Mock::given(method("GET"))
793 .and(path("/sitemap.xml"))
794 .respond_with(ResponseTemplate::new(200).set_body_raw(
795 b"<!DOCTYPE html><html><body>Not Found</body></html>".to_vec(),
796 "text/html; charset=utf-8",
797 ))
798 .mount(&server)
799 .await;
800
801 let agent = build_agent("test/1.0", Duration::from_secs(5));
802 let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
803 let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();
804
805 assert!(body.is_none());
806 }
807
808 #[tokio::test]
809 async fn fetch_sitemap_returns_none_on_404() {
810 let server = MockServer::start().await;
811 Mock::given(method("GET"))
812 .and(path("/sitemap.xml"))
813 .respond_with(ResponseTemplate::new(404))
814 .mount(&server)
815 .await;
816
817 let agent = build_agent("test/1.0", Duration::from_secs(5));
818 let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
819 let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();
820
821 assert!(body.is_none());
822 }
823
824 #[tokio::test]
825 async fn fetch_sitemap_handles_gzip() {
826 use std::io::Write as _;
827
828 use flate2::Compression;
829 use flate2::write::GzEncoder;
830
831 let server = MockServer::start().await;
832 let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/gz</loc></url></urlset>"#;
833 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
834 encoder.write_all(xml.as_bytes()).unwrap();
835 let compressed = encoder.finish().unwrap();
836
837 Mock::given(method("GET"))
838 .and(path("/sitemap.xml.gz"))
839 .respond_with(ResponseTemplate::new(200).set_body_raw(compressed, "application/gzip"))
840 .mount(&server)
841 .await;
842
843 let agent = build_agent("test/1.0", Duration::from_secs(5));
844 let url = Url::parse(&format!("{}/sitemap.xml.gz", server.uri())).unwrap();
845 let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();
846
847 let entries = parse_sitemap(&body.unwrap());
848 assert_eq!(entries.len(), 1);
849 }
850
851 #[tokio::test]
852 async fn fetch_html_extracts_links() {
853 let server = MockServer::start().await;
854 Mock::given(method("GET"))
855 .and(path("/"))
856 .respond_with(ResponseTemplate::new(200).set_body_raw(
857 br#"<html><body><a href="/link">x</a></body></html>"#.to_vec(),
858 "text/html; charset=utf-8",
859 ))
860 .mount(&server)
861 .await;
862
863 let agent = build_agent("test/1.0", Duration::from_secs(5));
864 let seed = Url::parse(&server.uri()).unwrap();
865 let html = spawn_blocking({
866 let seed = seed.clone();
867 move || fetch_html(&agent, &seed)
868 })
869 .await
870 .unwrap()
871 .unwrap();
872
873 let links = extract_links(&html, &seed);
874 assert_eq!(links.len(), 1);
875 }
876
877 async fn check_run(server: &MockServer, configure: impl FnOnce(&mut MapConfig)) -> Vec<MapEntry> {
878 let mut config = MapConfig {
879 seed: Url::parse(&server.uri()).unwrap(),
880 limit: 100,
881 include: None,
882 exclude: None,
883 user_agent: Some("test-bot".into()),
884 timeout: Duration::from_secs(5),
885 no_fallback: false,
886 };
887 configure(&mut config);
888 let mut entries = Vec::new();
889 run(&config, |e| {
890 entries.push(MapEntry {
891 url: e.url.clone(),
892 lastmod: e.lastmod.clone(),
893 });
894 })
895 .await;
896 entries
897 }
898
899 #[tokio::test]
900 async fn run_discovers_urls_from_sitemap() {
901 let server = MockServer::start().await;
902 Mock::given(method("GET"))
903 .and(path("/robots.txt"))
904 .respond_with(ResponseTemplate::new(200).set_body_string("User-agent: *\nAllow: /"))
905 .mount(&server)
906 .await;
907 let sitemap = format!(
908 "<urlset><url><loc>{}/page1</loc></url><url><loc>{}/page2</loc></url></urlset>",
909 server.uri(),
910 server.uri()
911 );
912 Mock::given(method("GET"))
913 .and(path("/sitemap.xml"))
914 .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
915 .mount(&server)
916 .await;
917
918 let entries = check_run(&server, |_| {}).await;
919 assert_eq!(entries.len(), 2);
920 assert!(entries.iter().any(|e| e.url.ends_with("/page1")));
921 assert!(entries.iter().any(|e| e.url.ends_with("/page2")));
922 }
923
924 #[tokio::test]
925 async fn run_respects_limit() {
926 let server = MockServer::start().await;
927 Mock::given(method("GET"))
928 .and(path("/robots.txt"))
929 .respond_with(ResponseTemplate::new(404))
930 .mount(&server)
931 .await;
932 let sitemap = format!(
933 "<urlset><url><loc>{}/a</loc></url><url><loc>{}/b</loc></url><url><loc>{}/c</loc></url></urlset>",
934 server.uri(),
935 server.uri(),
936 server.uri()
937 );
938 Mock::given(method("GET"))
939 .and(path("/sitemap.xml"))
940 .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
941 .mount(&server)
942 .await;
943
944 let entries = check_run(&server, |c| c.limit = 2).await;
945 assert_eq!(entries.len(), 2);
946 }
947
948 #[tokio::test]
949 async fn run_follows_sitemap_index() {
950 let server = MockServer::start().await;
951 Mock::given(method("GET"))
952 .and(path("/robots.txt"))
953 .respond_with(ResponseTemplate::new(404))
954 .mount(&server)
955 .await;
956 let index = format!(
957 "<sitemapindex><sitemap><loc>{}/sub.xml</loc></sitemap></sitemapindex>",
958 server.uri()
959 );
960 Mock::given(method("GET"))
961 .and(path("/sitemap.xml"))
962 .respond_with(ResponseTemplate::new(200).set_body_string(index))
963 .mount(&server)
964 .await;
965 let sub = format!("<urlset><url><loc>{}/deep</loc></url></urlset>", server.uri());
966 Mock::given(method("GET"))
967 .and(path("/sub.xml"))
968 .respond_with(ResponseTemplate::new(200).set_body_string(sub))
969 .mount(&server)
970 .await;
971
972 let entries = check_run(&server, |_| {}).await;
973 assert_eq!(entries.len(), 1);
974 assert!(entries[0].url.ends_with("/deep"));
975 }
976
977 #[tokio::test]
978 async fn run_falls_back_to_html_links() {
979 let server = MockServer::start().await;
980 Mock::given(method("GET"))
981 .and(path("/robots.txt"))
982 .respond_with(ResponseTemplate::new(404))
983 .mount(&server)
984 .await;
985 Mock::given(method("GET"))
986 .and(path("/sitemap.xml"))
987 .respond_with(ResponseTemplate::new(404))
988 .mount(&server)
989 .await;
990 let html = format!(
991 r#"<html><body><a href="{}/link1">L1</a><a href="{}/link2">L2</a></body></html>"#,
992 server.uri(),
993 server.uri()
994 );
995 Mock::given(method("GET"))
996 .and(path("/"))
997 .respond_with(ResponseTemplate::new(200).set_body_string(html))
998 .mount(&server)
999 .await;
1000
1001 let entries = check_run(&server, |_| {}).await;
1002 assert_eq!(entries.len(), 2);
1003 }
1004
1005 #[tokio::test]
1006 async fn run_no_fallback_skips_html() {
1007 let server = MockServer::start().await;
1008 Mock::given(method("GET"))
1009 .and(path("/robots.txt"))
1010 .respond_with(ResponseTemplate::new(404))
1011 .mount(&server)
1012 .await;
1013 Mock::given(method("GET"))
1014 .and(path("/sitemap.xml"))
1015 .respond_with(ResponseTemplate::new(404))
1016 .mount(&server)
1017 .await;
1018 Mock::given(method("GET"))
1019 .and(path("/"))
1020 .respond_with(
1021 ResponseTemplate::new(200).set_body_string(r#"<html><body><a href="/link">L</a></body></html>"#),
1022 )
1023 .mount(&server)
1024 .await;
1025
1026 let entries = check_run(&server, |c| c.no_fallback = true).await;
1027 assert_eq!(entries.len(), 0);
1028 }
1029
1030 #[tokio::test]
1031 async fn run_deduplicates_urls() {
1032 let server = MockServer::start().await;
1033 Mock::given(method("GET"))
1034 .and(path("/robots.txt"))
1035 .respond_with(ResponseTemplate::new(404))
1036 .mount(&server)
1037 .await;
1038 let sitemap = format!(
1039 "<urlset><url><loc>{}/dup</loc></url><url><loc>{}/dup</loc></url><url><loc>{}/unique</loc></url></urlset>",
1040 server.uri(),
1041 server.uri(),
1042 server.uri()
1043 );
1044 Mock::given(method("GET"))
1045 .and(path("/sitemap.xml"))
1046 .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
1047 .mount(&server)
1048 .await;
1049
1050 let entries = check_run(&server, |_| {}).await;
1051 assert_eq!(entries.len(), 2);
1052 }
1053 }
1054}