Skip to main content

servo_fetch/
map.rs

//! URL discovery via sitemap parsing, with an HTML link-extraction fallback — no rendering required.
2
3use std::collections::{HashSet, VecDeque};
4use std::io::Read as _;
5use std::time::{Duration, Instant};
6
7use tokio::task::spawn_blocking;
8use url::Url;
9
10use crate::robots::{RobotsPolicy, RobotsRules};
11use crate::scope::{is_same_site, matches_scope, normalize_url};
12use crate::{bridge, net};
13
/// Upper bound on the raw response-body bytes read when fetching a sitemap.
const MAP_SITEMAP_MAX_BYTES: u64 = 50 * 1024 * 1024;
/// Cap applied to the output of the gzip decoder for compressed sitemaps.
const MAP_SITEMAP_MAX_DECOMPRESSED: u64 = 10 * 1024 * 1024;
/// A gzip sitemap is rejected when its decoded size exceeds the compressed
/// size multiplied by this factor.
const MAP_GZIP_MAX_RATIO: u64 = 100;
/// Upper bound on the response-body bytes read for the HTML fallback page.
const MAP_HTML_MAX_BYTES: u64 = 2 * 1024 * 1024;
/// Bounds the redirect-following loop in `fetch_following_redirects`.
const MAP_MAX_REDIRECTS: u8 = 5;
/// Maximum number of sitemap documents fetched during one `run`.
const MAP_MAX_SITEMAPS: usize = 200;
/// Sitemaps queued at a nesting depth greater than this are skipped.
const MAP_MAX_INDEX_DEPTH: u8 = 5;
/// Minimum spacing between consecutive fetches, enforced by `throttle`.
const MAP_MIN_FETCH_INTERVAL: Duration = Duration::from_millis(500);
/// Candidate URLs longer than this many bytes are rejected by `validate_entry`.
const MAP_URL_MAX_LEN: usize = 2048;
/// Number of leading bytes `looks_like_html` inspects.
const HTML_SNIFF_LEN: usize = 100;
24
/// Builder-style options for URL discovery (sitemap + link extraction, no rendering).
///
/// Start from [`MapOptions::new`], chain the setters, then hand the value to `map()`.
#[must_use = "options do nothing until passed to map()"]
#[derive(Debug, Clone)]
pub struct MapOptions {
    /// Seed URL of the site to map.
    url: String,
    /// Maximum number of URLs to discover.
    limit: usize,
    /// URL path glob patterns to include (empty means no include filter).
    include: Vec<String>,
    /// URL path glob patterns to exclude (empty means no exclude filter).
    exclude: Vec<String>,
    /// User-Agent override; `None` leaves the choice to the caller of these options.
    user_agent: Option<String>,
    /// Timeout in seconds per HTTP request.
    timeout: u64,
    /// When `true`, the HTML link fallback is skipped.
    no_fallback: bool,
}

impl MapOptions {
    /// Build options for `url` with the defaults: a limit of 5000 URLs, a
    /// 30-second timeout, no glob filters, no User-Agent override, and the
    /// HTML fallback enabled.
    pub fn new(url: impl Into<String>) -> Self {
        let url = url.into();
        Self {
            url,
            limit: 5000,
            include: Vec::new(),
            exclude: Vec::new(),
            user_agent: None,
            timeout: 30,
            no_fallback: false,
        }
    }

    /// Set the maximum number of URLs to discover.
    pub fn limit(self, n: usize) -> Self {
        Self { limit: n, ..self }
    }

    /// Replace the include list with the given URL path glob patterns.
    pub fn include(self, patterns: &[&str]) -> Self {
        let include = patterns.iter().map(|&p| p.to_owned()).collect();
        Self { include, ..self }
    }

    /// Replace the exclude list with the given URL path glob patterns.
    pub fn exclude(self, patterns: &[&str]) -> Self {
        let exclude = patterns.iter().map(|&p| p.to_owned()).collect();
        Self { exclude, ..self }
    }

    /// Override the User-Agent string.
    pub fn user_agent(self, ua: impl Into<String>) -> Self {
        Self {
            user_agent: Some(ua.into()),
            ..self
        }
    }

    /// Set the per-request HTTP timeout, in seconds.
    pub fn timeout(self, secs: u64) -> Self {
        Self { timeout: secs, ..self }
    }

    /// Skip the HTML link fallback when no sitemap yields any URL.
    pub fn no_fallback(self, yes: bool) -> Self {
        Self {
            no_fallback: yes,
            ..self
        }
    }
}
88
/// A discovered URL from sitemap or link extraction.
///
/// This is the public result item returned by `map()`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct MappedUrl {
    /// The discovered URL, in the normalized form produced during validation.
    pub url: String,
    /// Last modification date from sitemap, if available.
    ///
    /// This is the raw text of the sitemap's `<lastmod>` element (it is not
    /// parsed into a date type) and is omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
98
99/// Discover URLs on a site via sitemaps and link extraction (no rendering).
100#[allow(clippy::needless_pass_by_value)]
101pub fn map(opts: MapOptions) -> crate::error::Result<Vec<MappedUrl>> {
102    net::ensure_crypto_provider();
103    let seed = net::validate_url(&opts.url)?;
104
105    let include = if opts.include.is_empty() {
106        None
107    } else {
108        Some(crate::scope::build_globset(&opts.include)?)
109    };
110    let exclude = if opts.exclude.is_empty() {
111        None
112    } else {
113        Some(crate::scope::build_globset(&opts.exclude)?)
114    };
115
116    let internal = MapConfig {
117        seed,
118        limit: opts.limit,
119        include,
120        exclude,
121        user_agent: opts.user_agent,
122        timeout: Duration::from_secs(opts.timeout),
123        no_fallback: opts.no_fallback,
124    };
125
126    let mut results = Vec::new();
127    crate::runtime::block_on(run(&internal, |entry| {
128        results.push(MappedUrl {
129            url: entry.url.clone(),
130            lastmod: entry.lastmod.clone(),
131        });
132    }))
133    .map_err(|e| crate::error::Error::Engine(e.to_string()))?;
134    Ok(results)
135}
136
/// Options for URL discovery.
///
/// Crate-internal, already-resolved counterpart of `MapOptions`: the seed is a
/// parsed `Url` and the glob patterns are compiled.
pub(crate) struct MapConfig {
    /// Seed URL; discovered URLs and sitemaps must be on the same site as this.
    pub seed: Url,
    /// Maximum number of entries reported by `run`.
    pub limit: usize,
    /// Compiled include globs, or `None` for no include filter.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude globs, or `None` for no exclude filter.
    pub exclude: Option<globset::GlobSet>,
    /// User-Agent override; when `None`, `run` uses `bridge::default_user_agent()`.
    pub user_agent: Option<String>,
    /// Timeout handed to the HTTP agent and to the robots.txt fetch.
    pub timeout: Duration,
    /// When `true`, `run` does not fall back to HTML link extraction.
    pub no_fallback: bool,
}
147
/// A discovered URL with optional metadata.
///
/// Produced by `validate_entry` and passed by reference to the `on_url`
/// callback of `run`.
#[derive(serde::Serialize)]
pub(crate) struct MapEntry {
    /// The accepted URL in normalized form.
    pub url: String,
    /// Raw `<lastmod>` text from the sitemap, if any; skipped when serializing `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
155
156/// Run URL discovery for a site.
157pub(crate) async fn run(opts: &MapConfig, mut on_url: impl FnMut(&MapEntry)) {
158    let ua = opts
159        .user_agent
160        .as_deref()
161        .unwrap_or_else(|| bridge::default_user_agent());
162    let agent = build_agent(ua, opts.timeout);
163
164    let robots = {
165        let seed = opts.seed.clone();
166        let user_agent = opts.user_agent.clone();
167        let timeout = opts.timeout;
168        spawn_blocking(move || RobotsRules::fetch(&seed, user_agent.as_deref(), timeout))
169            .await
170            .unwrap_or(RobotsPolicy::Unreachable)
171    };
172
173    let mut visited = HashSet::new();
174    let mut count = 0;
175    let mut last_fetch = Instant::now()
176        .checked_sub(MAP_MIN_FETCH_INTERVAL)
177        .unwrap_or_else(Instant::now);
178    let mut sitemap_queue: VecDeque<(Url, u8)> = discover_sitemaps(&robots, &opts.seed)
179        .into_iter()
180        .map(|u| (u, 0))
181        .collect();
182    let mut sitemaps_fetched = 0;
183
184    while let Some((sitemap_url, depth)) = sitemap_queue.pop_front() {
185        if sitemaps_fetched >= MAP_MAX_SITEMAPS || count >= opts.limit {
186            break;
187        }
188        if depth > MAP_MAX_INDEX_DEPTH || !is_same_site(&opts.seed, &sitemap_url) {
189            continue;
190        }
191
192        throttle(&mut last_fetch).await;
193        sitemaps_fetched += 1;
194
195        let body = {
196            let agent = agent.clone();
197            spawn_blocking({
198                let seed = opts.seed.clone();
199                move || fetch_sitemap(&agent, &sitemap_url, &seed)
200            })
201            .await
202            .ok()
203            .flatten()
204        };
205        let Some(body) = body else { continue };
206
207        for entry in parse_sitemap(&body) {
208            match entry {
209                SitemapEntry::Url { loc, lastmod } => {
210                    if count >= opts.limit {
211                        break;
212                    }
213                    if let Some(e) = validate_entry(&loc, lastmod, &opts.seed, &robots, opts, &mut visited) {
214                        on_url(&e);
215                        count += 1;
216                    }
217                }
218                SitemapEntry::Sitemap { loc } => {
219                    if let Ok(url) = Url::parse(&loc) {
220                        sitemap_queue.push_back((url, depth + 1));
221                    }
222                }
223            }
224        }
225    }
226
227    if count == 0 && !opts.no_fallback {
228        throttle(&mut last_fetch).await;
229        let html = {
230            let agent = agent.clone();
231            let seed = opts.seed.clone();
232            spawn_blocking(move || fetch_html(&agent, &seed)).await.ok().flatten()
233        };
234        if let Some(html) = html {
235            for link in extract_links(&html, &opts.seed) {
236                if count >= opts.limit {
237                    break;
238                }
239                if let Some(e) = validate_entry(link.as_str(), None, &opts.seed, &robots, opts, &mut visited) {
240                    on_url(&e);
241                    count += 1;
242                }
243            }
244        }
245    }
246}
247
248fn discover_sitemaps(robots: &RobotsPolicy, seed: &Url) -> Vec<Url> {
249    let mut urls = Vec::new();
250    if let RobotsPolicy::Rules(rules) = robots {
251        urls.extend(rules.sitemaps.iter().cloned());
252    }
253    if let Ok(default) = seed.join("/sitemap.xml") {
254        if !urls.contains(&default) {
255            urls.push(default);
256        }
257    }
258    urls
259}
260
261fn build_agent(ua: &str, timeout: Duration) -> ureq::Agent {
262    ureq::Agent::new_with_config(
263        ureq::config::Config::builder()
264            .max_redirects(0)
265            .http_status_as_error(false)
266            .timeout_global(Some(timeout))
267            .user_agent(ua)
268            .build(),
269    )
270}
271
272fn fetch_following_redirects(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<ureq::http::Response<ureq::Body>> {
273    let mut current = url.clone();
274    for _ in 0..MAP_MAX_REDIRECTS {
275        let resp = agent.get(current.as_str()).call().ok()?;
276        let status = resp.status().as_u16();
277        if matches!(status, 301 | 302 | 303 | 307 | 308) {
278            let location = resp.headers().get("location")?.to_str().ok()?;
279            let next = current.join(location).ok()?;
280            if net::validate_url_with_policy(next.as_str(), bridge::engine_policy()).is_err()
281                || !is_same_site(seed, &next)
282            {
283                return None;
284            }
285            current = next;
286            continue;
287        }
288        if status >= 400 {
289            return None;
290        }
291        return Some(resp);
292    }
293    None
294}
295
296fn fetch_sitemap(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<String> {
297    let resp = fetch_following_redirects(agent, url, seed)?;
298    let content_type = resp
299        .headers()
300        .get("content-type")
301        .and_then(|v| v.to_str().ok())
302        .unwrap_or("");
303
304    let is_gzip = url
305        .path()
306        .rsplit('/')
307        .next()
308        .and_then(|seg| std::path::Path::new(seg).extension())
309        .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
310        || content_type.contains("gzip")
311        || resp
312            .headers()
313            .get("content-encoding")
314            .and_then(|v| v.to_str().ok())
315            .is_some_and(|v| v.contains("gzip"));
316
317    if is_gzip {
318        let bytes = resp
319            .into_body()
320            .with_config()
321            .limit(MAP_SITEMAP_MAX_BYTES)
322            .read_to_vec()
323            .ok()?;
324        let mut decoded = Vec::new();
325        flate2::read::GzDecoder::new(bytes.as_slice())
326            .take(MAP_SITEMAP_MAX_DECOMPRESSED)
327            .read_to_end(&mut decoded)
328            .ok()?;
329        if decoded.len() as u64 > bytes.len() as u64 * MAP_GZIP_MAX_RATIO {
330            return None;
331        }
332        if looks_like_html(&decoded) {
333            return None;
334        }
335        String::from_utf8(decoded).ok()
336    } else {
337        let body = resp
338            .into_body()
339            .with_config()
340            .limit(MAP_SITEMAP_MAX_BYTES)
341            .read_to_string()
342            .ok()?;
343        if looks_like_html(body.as_bytes()) {
344            return None;
345        }
346        Some(body)
347    }
348}
349
350fn looks_like_html(bytes: &[u8]) -> bool {
351    const DOCTYPE: &[u8] = b"<!doctype";
352    const HTML: &[u8] = b"<html";
353    const BOM: &[u8] = b"\xef\xbb\xbf";
354    let mut prefix = bytes.get(..HTML_SNIFF_LEN).unwrap_or(bytes);
355    if prefix.starts_with(BOM) {
356        prefix = &prefix[BOM.len()..];
357    }
358    let prefix = prefix
359        .iter()
360        .position(|b| !b.is_ascii_whitespace())
361        .map_or(&[][..], |i| &prefix[i..]);
362    prefix
363        .get(..DOCTYPE.len())
364        .is_some_and(|p| p.eq_ignore_ascii_case(DOCTYPE))
365        || prefix.get(..HTML.len()).is_some_and(|p| p.eq_ignore_ascii_case(HTML))
366}
367
368fn fetch_html(agent: &ureq::Agent, url: &Url) -> Option<String> {
369    let resp = fetch_following_redirects(agent, url, url)?;
370    resp.into_body()
371        .with_config()
372        .limit(MAP_HTML_MAX_BYTES)
373        .read_to_string()
374        .ok()
375}
376
377fn extract_links(html: &str, base: &Url) -> Vec<Url> {
378    dom_query::Document::from(html)
379        .select("a[href]")
380        .iter()
381        .filter_map(|el| {
382            let href = el.attr("href")?;
383            let href = href.trim();
384            if href.is_empty() {
385                return None;
386            }
387            let resolved = base.join(href).ok()?;
388            matches!(resolved.scheme(), "http" | "https").then_some(resolved)
389        })
390        .collect()
391}
392
/// One record extracted from a sitemap document by `parse_sitemap`.
enum SitemapEntry {
    /// A `<url>` record: its `<loc>` text and, when present, its `<lastmod>` text.
    Url { loc: String, lastmod: Option<String> },
    /// A `<sitemap>` record from a sitemap index: the `<loc>` of a nested sitemap.
    Sitemap { loc: String },
}
397
398fn parse_sitemap(body: &str) -> Vec<SitemapEntry> {
399    use quick_xml::events::Event;
400    use quick_xml::reader::Reader;
401
402    let mut reader = Reader::from_str(body);
403    let mut entries = Vec::new();
404    let mut buf = Vec::new();
405    let mut capture = Capture::Idle;
406    let mut loc = String::new();
407    let mut lastmod = String::new();
408    let mut in_url = false;
409    let mut in_sitemap = false;
410    let mut depth: u32 = 0;
411
412    loop {
413        match reader.read_event_into(&mut buf) {
414            Ok(Event::Start(e)) => {
415                let name = e.local_name();
416                match name.as_ref() {
417                    b"url" => {
418                        in_url = true;
419                        depth = 0;
420                    }
421                    b"sitemap" => {
422                        in_sitemap = true;
423                        depth = 0;
424                    }
425                    b"loc" if (in_url || in_sitemap) && depth == 0 => capture = Capture::Loc,
426                    b"lastmod" if in_url && depth == 0 => capture = Capture::Lastmod,
427                    _ if in_url || in_sitemap => depth += 1,
428                    _ => {}
429                }
430            }
431            Ok(Event::Text(e)) => {
432                if let Ok(text) = e.xml10_content() {
433                    match capture {
434                        Capture::Loc => loc.push_str(text.trim()),
435                        Capture::Lastmod => lastmod.push_str(text.trim()),
436                        Capture::Idle => {}
437                    }
438                } else {
439                    loc.clear();
440                    lastmod.clear();
441                    capture = Capture::Idle;
442                }
443            }
444            Ok(Event::GeneralRef(e)) => {
445                let resolved = match &*e {
446                    b"amp" => "&",
447                    b"lt" => "<",
448                    b"gt" => ">",
449                    b"quot" => "\"",
450                    b"apos" => "'",
451                    _ => "",
452                };
453                match capture {
454                    Capture::Loc => loc.push_str(resolved),
455                    Capture::Lastmod => lastmod.push_str(resolved),
456                    Capture::Idle => {}
457                }
458            }
459            Ok(Event::End(e)) => {
460                let name = e.local_name();
461                match name.as_ref() {
462                    b"url" if in_url => {
463                        if !loc.is_empty() {
464                            let lm = if lastmod.is_empty() {
465                                None
466                            } else {
467                                Some(std::mem::take(&mut lastmod))
468                            };
469                            entries.push(SitemapEntry::Url {
470                                loc: std::mem::take(&mut loc),
471                                lastmod: lm,
472                            });
473                        }
474                        loc.clear();
475                        lastmod.clear();
476                        in_url = false;
477                    }
478                    b"sitemap" if in_sitemap => {
479                        if !loc.is_empty() {
480                            entries.push(SitemapEntry::Sitemap {
481                                loc: std::mem::take(&mut loc),
482                            });
483                        }
484                        loc.clear();
485                        lastmod.clear();
486                        in_sitemap = false;
487                    }
488                    b"loc" | b"lastmod" if capture != Capture::Idle => capture = Capture::Idle,
489                    _ if depth > 0 => depth -= 1,
490                    _ => {}
491                }
492            }
493            Ok(Event::Eof) | Err(_) => break,
494            _ => {}
495        }
496        buf.clear();
497    }
498
499    entries
500}
501
/// Which text buffer `parse_sitemap` is currently filling.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Capture {
    /// Not inside a captured element; text is ignored.
    Idle,
    /// Inside a captured `<loc>` element; text is appended to the location buffer.
    Loc,
    /// Inside a captured `<lastmod>` element; text is appended to the last-modified buffer.
    Lastmod,
}
508
509fn validate_entry(
510    loc: &str,
511    lastmod: Option<String>,
512    seed: &Url,
513    robots: &RobotsPolicy,
514    opts: &MapConfig,
515    visited: &mut HashSet<String>,
516) -> Option<MapEntry> {
517    if loc.len() > MAP_URL_MAX_LEN {
518        return None;
519    }
520    let url = Url::parse(loc)
521        .ok()
522        .filter(|u| matches!(u.scheme(), "http" | "https"))?;
523    if !is_same_site(seed, &url) {
524        return None;
525    }
526    if !robots.is_allowed(&url) {
527        return None;
528    }
529    if !matches_scope(&url, opts.include.as_ref(), opts.exclude.as_ref()) {
530        return None;
531    }
532    let normalized = normalize_url(&url);
533    if !visited.insert(normalized.clone()) {
534        return None;
535    }
536    Some(MapEntry {
537        url: normalized,
538        lastmod,
539    })
540}
541
542async fn throttle(last_fetch: &mut Instant) {
543    let elapsed = last_fetch.elapsed();
544    if elapsed < MAP_MIN_FETCH_INTERVAL {
545        tokio::time::sleep(MAP_MIN_FETCH_INTERVAL.saturating_sub(elapsed)).await;
546    }
547    *last_fetch = Instant::now();
548}
549
550#[cfg(test)]
551mod tests {
552    use super::*;
553
554    fn test_config(seed: &str) -> MapConfig {
555        MapConfig {
556            seed: Url::parse(seed).unwrap(),
557            limit: 100,
558            include: None,
559            exclude: None,
560            user_agent: None,
561            timeout: Duration::from_secs(30),
562            no_fallback: false,
563        }
564    }
565
566    #[test]
567    fn parse_urlset() {
568        let xml = r#"<?xml version="1.0"?>
569<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
570  <url><loc>https://example.com/a</loc><lastmod>2026-01-01</lastmod></url>
571  <url><loc>https://example.com/b</loc></url>
572</urlset>"#;
573        let entries = parse_sitemap(xml);
574        assert_eq!(entries.len(), 2);
575        match &entries[0] {
576            SitemapEntry::Url { loc, lastmod } => {
577                assert_eq!(loc, "https://example.com/a");
578                assert_eq!(lastmod.as_deref(), Some("2026-01-01"));
579            }
580            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
581        }
582        match &entries[1] {
583            SitemapEntry::Url { loc, lastmod } => {
584                assert_eq!(loc, "https://example.com/b");
585                assert!(lastmod.is_none());
586            }
587            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
588        }
589    }
590
591    #[test]
592    fn parse_sitemapindex() {
593        let xml = r#"<?xml version="1.0"?>
594<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
595  <sitemap><loc>https://example.com/sitemap1.xml</loc></sitemap>
596  <sitemap><loc>https://example.com/sitemap2.xml</loc></sitemap>
597</sitemapindex>"#;
598        let entries = parse_sitemap(xml);
599        assert_eq!(entries.len(), 2);
600        match &entries[0] {
601            SitemapEntry::Sitemap { loc } => assert_eq!(loc, "https://example.com/sitemap1.xml"),
602            SitemapEntry::Url { .. } => panic!("expected Sitemap"),
603        }
604    }
605
606    #[test]
607    fn parse_handles_xml_entities() {
608        let xml = r"<urlset><url><loc>https://example.com/a?b=1&amp;c=2</loc></url></urlset>";
609        let entries = parse_sitemap(xml);
610        assert_eq!(entries.len(), 1);
611        match &entries[0] {
612            SitemapEntry::Url { loc, .. } => assert_eq!(loc, "https://example.com/a?b=1&c=2"),
613            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
614        }
615    }
616
617    #[test]
618    fn parse_handles_namespaced_tags() {
619        let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
620                xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
621  <url>
622    <loc>https://example.com/page</loc>
623    <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
624  </url>
625</urlset>"#;
626        let entries = parse_sitemap(xml);
627        assert_eq!(entries.len(), 1);
628        match &entries[0] {
629            SitemapEntry::Url { loc, .. } => assert_eq!(loc, "https://example.com/page"),
630            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
631        }
632    }
633
634    #[test]
635    fn parse_loc_after_nested_extension() {
636        let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
637                xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
638  <url>
639    <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
640    <loc>https://example.com/page</loc>
641    <lastmod>2026-01-01</lastmod>
642  </url>
643</urlset>"#;
644        let entries = parse_sitemap(xml);
645        assert_eq!(entries.len(), 1);
646        match &entries[0] {
647            SitemapEntry::Url { loc, lastmod } => {
648                assert_eq!(loc, "https://example.com/page");
649                assert_eq!(lastmod.as_deref(), Some("2026-01-01"));
650            }
651            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
652        }
653    }
654
655    #[test]
656    fn parse_empty_body_returns_empty() {
657        assert!(parse_sitemap("").is_empty());
658        assert!(parse_sitemap("<html><body>Not Found</body></html>").is_empty());
659    }
660
661    #[test]
662    fn looks_like_html_detects_variants() {
663        assert!(looks_like_html(b"<!DOCTYPE html>"));
664        assert!(looks_like_html(b"<!doctype html>"));
665        assert!(looks_like_html(b"<html lang=\"en\">"));
666        assert!(looks_like_html(b"<HTML>"));
667        assert!(looks_like_html(b"\xef\xbb\xbf<!DOCTYPE html>"));
668        assert!(looks_like_html(b"  \n<!doctype html>"));
669        assert!(looks_like_html(b"\xef\xbb\xbf  <html>"));
670        assert!(!looks_like_html(b"<?xml version=\"1.0\"?>"));
671        assert!(!looks_like_html(b"<urlset>"));
672        assert!(!looks_like_html(b""));
673    }
674
675    #[test]
676    fn validate_entry_rejects_private_ip() {
677        let opts = test_config("https://example.com");
678
679        let mut visited = HashSet::new();
680        let result = validate_entry(
681            "http://127.0.0.1/secret",
682            None,
683            &opts.seed,
684            &RobotsPolicy::Unavailable,
685            &opts,
686            &mut visited,
687        );
688        assert!(result.is_none());
689    }
690
691    #[test]
692    fn validate_entry_rejects_cross_site() {
693        let opts = test_config("https://example.com");
694        let robots = RobotsPolicy::Unavailable;
695        let mut visited = HashSet::new();
696        let result = validate_entry("https://evil.com/page", None, &opts.seed, &robots, &opts, &mut visited);
697        assert!(result.is_none());
698    }
699
700    #[test]
701    fn validate_entry_deduplicates() {
702        let opts = test_config("https://example.com");
703        let robots = RobotsPolicy::Unavailable;
704        let mut visited = HashSet::new();
705        let first = validate_entry(
706            "https://example.com/page",
707            None,
708            &opts.seed,
709            &robots,
710            &opts,
711            &mut visited,
712        );
713        assert!(first.is_some());
714        let second = validate_entry(
715            "https://example.com/page",
716            None,
717            &opts.seed,
718            &robots,
719            &opts,
720            &mut visited,
721        );
722        assert!(second.is_none());
723    }
724
725    #[test]
726    fn validate_entry_rejects_long_url() {
727        let opts = test_config("https://example.com");
728        let robots = RobotsPolicy::Unavailable;
729        let mut visited = HashSet::new();
730        let long_url = format!("https://example.com/{}", "a".repeat(MAP_URL_MAX_LEN));
731        let result = validate_entry(&long_url, None, &opts.seed, &robots, &opts, &mut visited);
732        assert!(result.is_none());
733    }
734
735    #[test]
736    fn discover_sitemaps_includes_robots_and_default() {
737        let seed = Url::parse("https://example.com").unwrap();
738        let robots = RobotsPolicy::Rules(RobotsRules {
739            rules: Vec::new(),
740            sitemaps: vec![Url::parse("https://example.com/custom-sitemap.xml").unwrap()],
741        });
742        let sitemaps = discover_sitemaps(&robots, &seed);
743        assert_eq!(sitemaps.len(), 2);
744        assert_eq!(sitemaps[0].as_str(), "https://example.com/custom-sitemap.xml");
745        assert_eq!(sitemaps[1].as_str(), "https://example.com/sitemap.xml");
746    }
747
748    #[test]
749    fn discover_sitemaps_deduplicates_default() {
750        let seed = Url::parse("https://example.com").unwrap();
751        let robots = RobotsPolicy::Rules(RobotsRules {
752            rules: Vec::new(),
753            sitemaps: vec![Url::parse("https://example.com/sitemap.xml").unwrap()],
754        });
755        let sitemaps = discover_sitemaps(&robots, &seed);
756        assert_eq!(sitemaps.len(), 1);
757    }
758
759    mod integration {
760        use std::time::Duration;
761
762        use tokio::task::spawn_blocking;
763        use url::Url;
764        use wiremock::matchers::{method, path};
765        use wiremock::{Mock, MockServer, ResponseTemplate};
766
767        use crate::map::{
768            MapConfig, MapEntry, build_agent, extract_links, fetch_html, fetch_sitemap, parse_sitemap, run,
769        };
770
771        #[tokio::test]
772        async fn fetch_sitemap_parses_urlset() {
773            let server = MockServer::start().await;
774            let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/a</loc></url></urlset>"#;
775            Mock::given(method("GET"))
776                .and(path("/sitemap.xml"))
777                .respond_with(ResponseTemplate::new(200).set_body_raw(xml.as_bytes().to_vec(), "application/xml"))
778                .mount(&server)
779                .await;
780
781            let agent = build_agent("test/1.0", Duration::from_secs(5));
782            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
783            let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();
784
785            let entries = parse_sitemap(&body.unwrap());
786            assert_eq!(entries.len(), 1);
787        }
788
789        #[tokio::test]
790        async fn fetch_sitemap_rejects_html_error_page() {
791            let server = MockServer::start().await;
792            Mock::given(method("GET"))
793                .and(path("/sitemap.xml"))
794                .respond_with(ResponseTemplate::new(200).set_body_raw(
795                    b"<!DOCTYPE html><html><body>Not Found</body></html>".to_vec(),
796                    "text/html; charset=utf-8",
797                ))
798                .mount(&server)
799                .await;
800
801            let agent = build_agent("test/1.0", Duration::from_secs(5));
802            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
803            let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();
804
805            assert!(body.is_none());
806        }
807
808        #[tokio::test]
809        async fn fetch_sitemap_returns_none_on_404() {
810            let server = MockServer::start().await;
811            Mock::given(method("GET"))
812                .and(path("/sitemap.xml"))
813                .respond_with(ResponseTemplate::new(404))
814                .mount(&server)
815                .await;
816
817            let agent = build_agent("test/1.0", Duration::from_secs(5));
818            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
819            let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();
820
821            assert!(body.is_none());
822        }
823
824        #[tokio::test]
825        async fn fetch_sitemap_handles_gzip() {
826            use std::io::Write as _;
827
828            use flate2::Compression;
829            use flate2::write::GzEncoder;
830
831            let server = MockServer::start().await;
832            let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/gz</loc></url></urlset>"#;
833            let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
834            encoder.write_all(xml.as_bytes()).unwrap();
835            let compressed = encoder.finish().unwrap();
836
837            Mock::given(method("GET"))
838                .and(path("/sitemap.xml.gz"))
839                .respond_with(ResponseTemplate::new(200).set_body_raw(compressed, "application/gzip"))
840                .mount(&server)
841                .await;
842
843            let agent = build_agent("test/1.0", Duration::from_secs(5));
844            let url = Url::parse(&format!("{}/sitemap.xml.gz", server.uri())).unwrap();
845            let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();
846
847            let entries = parse_sitemap(&body.unwrap());
848            assert_eq!(entries.len(), 1);
849        }
850
851        #[tokio::test]
852        async fn fetch_html_extracts_links() {
853            let server = MockServer::start().await;
854            Mock::given(method("GET"))
855                .and(path("/"))
856                .respond_with(ResponseTemplate::new(200).set_body_raw(
857                    br#"<html><body><a href="/link">x</a></body></html>"#.to_vec(),
858                    "text/html; charset=utf-8",
859                ))
860                .mount(&server)
861                .await;
862
863            let agent = build_agent("test/1.0", Duration::from_secs(5));
864            let seed = Url::parse(&server.uri()).unwrap();
865            let html = spawn_blocking({
866                let seed = seed.clone();
867                move || fetch_html(&agent, &seed)
868            })
869            .await
870            .unwrap()
871            .unwrap();
872
873            let links = extract_links(&html, &seed);
874            assert_eq!(links.len(), 1);
875        }
876
877        async fn check_run(server: &MockServer, configure: impl FnOnce(&mut MapConfig)) -> Vec<MapEntry> {
878            let mut config = MapConfig {
879                seed: Url::parse(&server.uri()).unwrap(),
880                limit: 100,
881                include: None,
882                exclude: None,
883                user_agent: Some("test-bot".into()),
884                timeout: Duration::from_secs(5),
885                no_fallback: false,
886            };
887            configure(&mut config);
888            let mut entries = Vec::new();
889            run(&config, |e| {
890                entries.push(MapEntry {
891                    url: e.url.clone(),
892                    lastmod: e.lastmod.clone(),
893                });
894            })
895            .await;
896            entries
897        }
898
899        #[tokio::test]
900        async fn run_discovers_urls_from_sitemap() {
901            let server = MockServer::start().await;
902            Mock::given(method("GET"))
903                .and(path("/robots.txt"))
904                .respond_with(ResponseTemplate::new(200).set_body_string("User-agent: *\nAllow: /"))
905                .mount(&server)
906                .await;
907            let sitemap = format!(
908                "<urlset><url><loc>{}/page1</loc></url><url><loc>{}/page2</loc></url></urlset>",
909                server.uri(),
910                server.uri()
911            );
912            Mock::given(method("GET"))
913                .and(path("/sitemap.xml"))
914                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
915                .mount(&server)
916                .await;
917
918            let entries = check_run(&server, |_| {}).await;
919            assert_eq!(entries.len(), 2);
920            assert!(entries.iter().any(|e| e.url.ends_with("/page1")));
921            assert!(entries.iter().any(|e| e.url.ends_with("/page2")));
922        }
923
924        #[tokio::test]
925        async fn run_respects_limit() {
926            let server = MockServer::start().await;
927            Mock::given(method("GET"))
928                .and(path("/robots.txt"))
929                .respond_with(ResponseTemplate::new(404))
930                .mount(&server)
931                .await;
932            let sitemap = format!(
933                "<urlset><url><loc>{}/a</loc></url><url><loc>{}/b</loc></url><url><loc>{}/c</loc></url></urlset>",
934                server.uri(),
935                server.uri(),
936                server.uri()
937            );
938            Mock::given(method("GET"))
939                .and(path("/sitemap.xml"))
940                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
941                .mount(&server)
942                .await;
943
944            let entries = check_run(&server, |c| c.limit = 2).await;
945            assert_eq!(entries.len(), 2);
946        }
947
948        #[tokio::test]
949        async fn run_follows_sitemap_index() {
950            let server = MockServer::start().await;
951            Mock::given(method("GET"))
952                .and(path("/robots.txt"))
953                .respond_with(ResponseTemplate::new(404))
954                .mount(&server)
955                .await;
956            let index = format!(
957                "<sitemapindex><sitemap><loc>{}/sub.xml</loc></sitemap></sitemapindex>",
958                server.uri()
959            );
960            Mock::given(method("GET"))
961                .and(path("/sitemap.xml"))
962                .respond_with(ResponseTemplate::new(200).set_body_string(index))
963                .mount(&server)
964                .await;
965            let sub = format!("<urlset><url><loc>{}/deep</loc></url></urlset>", server.uri());
966            Mock::given(method("GET"))
967                .and(path("/sub.xml"))
968                .respond_with(ResponseTemplate::new(200).set_body_string(sub))
969                .mount(&server)
970                .await;
971
972            let entries = check_run(&server, |_| {}).await;
973            assert_eq!(entries.len(), 1);
974            assert!(entries[0].url.ends_with("/deep"));
975        }
976
977        #[tokio::test]
978        async fn run_falls_back_to_html_links() {
979            let server = MockServer::start().await;
980            Mock::given(method("GET"))
981                .and(path("/robots.txt"))
982                .respond_with(ResponseTemplate::new(404))
983                .mount(&server)
984                .await;
985            Mock::given(method("GET"))
986                .and(path("/sitemap.xml"))
987                .respond_with(ResponseTemplate::new(404))
988                .mount(&server)
989                .await;
990            let html = format!(
991                r#"<html><body><a href="{}/link1">L1</a><a href="{}/link2">L2</a></body></html>"#,
992                server.uri(),
993                server.uri()
994            );
995            Mock::given(method("GET"))
996                .and(path("/"))
997                .respond_with(ResponseTemplate::new(200).set_body_string(html))
998                .mount(&server)
999                .await;
1000
1001            let entries = check_run(&server, |_| {}).await;
1002            assert_eq!(entries.len(), 2);
1003        }
1004
1005        #[tokio::test]
1006        async fn run_no_fallback_skips_html() {
1007            let server = MockServer::start().await;
1008            Mock::given(method("GET"))
1009                .and(path("/robots.txt"))
1010                .respond_with(ResponseTemplate::new(404))
1011                .mount(&server)
1012                .await;
1013            Mock::given(method("GET"))
1014                .and(path("/sitemap.xml"))
1015                .respond_with(ResponseTemplate::new(404))
1016                .mount(&server)
1017                .await;
1018            Mock::given(method("GET"))
1019                .and(path("/"))
1020                .respond_with(
1021                    ResponseTemplate::new(200).set_body_string(r#"<html><body><a href="/link">L</a></body></html>"#),
1022                )
1023                .mount(&server)
1024                .await;
1025
1026            let entries = check_run(&server, |c| c.no_fallback = true).await;
1027            assert_eq!(entries.len(), 0);
1028        }
1029
1030        #[tokio::test]
1031        async fn run_deduplicates_urls() {
1032            let server = MockServer::start().await;
1033            Mock::given(method("GET"))
1034                .and(path("/robots.txt"))
1035                .respond_with(ResponseTemplate::new(404))
1036                .mount(&server)
1037                .await;
1038            let sitemap = format!(
1039                "<urlset><url><loc>{}/dup</loc></url><url><loc>{}/dup</loc></url><url><loc>{}/unique</loc></url></urlset>",
1040                server.uri(),
1041                server.uri(),
1042                server.uri()
1043            );
1044            Mock::given(method("GET"))
1045                .and(path("/sitemap.xml"))
1046                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
1047                .mount(&server)
1048                .await;
1049
1050            let entries = check_run(&server, |_| {}).await;
1051            assert_eq!(entries.len(), 2);
1052        }
1053    }
1054}