Skip to main content

servo_fetch/
map.rs

1//! URL discovery via sitemap parsing — no rendering required.
2
3use std::collections::{HashSet, VecDeque};
4use std::io::Read as _;
5use std::time::{Duration, Instant};
6
7use url::Url;
8
9use crate::bridge;
10use crate::net;
11use crate::robots::{RobotsPolicy, RobotsRules};
12use crate::scope::{is_same_site, matches_scope, normalize_url};
13
// Resource limits and pacing used throughout sitemap discovery.

/// Byte cap applied when reading a sitemap response body off the wire.
const MAP_SITEMAP_MAX_BYTES: u64 = 50 * 1024 * 1024;
/// Byte cap applied to the output of the gzip decoder for compressed sitemaps.
const MAP_SITEMAP_MAX_DECOMPRESSED: u64 = 10 * 1024 * 1024;
/// Largest accepted decompressed/compressed size ratio for a gzip sitemap.
const MAP_GZIP_MAX_RATIO: u64 = 100;
/// Byte cap applied when reading the seed page for the HTML link fallback.
const MAP_HTML_MAX_BYTES: u64 = 2 * 1024 * 1024;
/// Bound on the manual redirect-following loop in `fetch_following_redirects`.
const MAP_MAX_REDIRECTS: u8 = 5;
/// Upper bound on sitemap fetch attempts in a single discovery run.
const MAP_MAX_SITEMAPS: usize = 200;
/// Deepest sitemap-index nesting level that is still fetched.
const MAP_MAX_INDEX_DEPTH: u8 = 5;
/// Minimum spacing enforced between consecutive fetches by `throttle`.
const MAP_MIN_FETCH_INTERVAL: Duration = Duration::from_millis(500);
/// Candidate URLs longer than this many bytes are rejected by `validate_entry`.
const MAP_URL_MAX_LEN: usize = 2048;
/// Number of leading bytes inspected by `looks_like_html`.
const HTML_SNIFF_LEN: usize = 100;
24
/// Builder-style settings for a URL discovery run (sitemaps first, optional
/// link extraction, never any rendering).
#[must_use = "options do nothing until passed to map()"]
#[derive(Debug, Clone)]
pub struct MapOptions {
    /// Seed URL exactly as supplied by the caller.
    url: String,
    /// Maximum number of URLs to report.
    limit: usize,
    /// Glob patterns a URL must match to be reported (empty = no filter).
    include: Vec<String>,
    /// Glob patterns that remove a URL from the results (empty = no filter).
    exclude: Vec<String>,
    /// Optional User-Agent override.
    user_agent: Option<String>,
    /// Per-request timeout, in seconds.
    timeout: u64,
    /// When set, the HTML link fallback is not attempted.
    no_fallback: bool,
}

impl MapOptions {
    /// Build options for `url` with the defaults: a limit of 5000 URLs, no
    /// include/exclude patterns, no User-Agent override, a 30 second timeout
    /// and the HTML fallback enabled.
    pub fn new(url: impl Into<String>) -> Self {
        let url = url.into();
        Self {
            url,
            limit: 5000,
            include: Vec::new(),
            exclude: Vec::new(),
            user_agent: None,
            timeout: 30,
            no_fallback: false,
        }
    }

    /// Cap the number of discovered URLs at `n`.
    pub fn limit(self, n: usize) -> Self {
        Self { limit: n, ..self }
    }

    /// Replace the include list with the given URL path glob patterns.
    pub fn include(self, patterns: &[&str]) -> Self {
        let include = patterns.iter().copied().map(str::to_owned).collect();
        Self { include, ..self }
    }

    /// Replace the exclude list with the given URL path glob patterns.
    pub fn exclude(self, patterns: &[&str]) -> Self {
        let exclude = patterns.iter().copied().map(str::to_owned).collect();
        Self { exclude, ..self }
    }

    /// Send `ua` as the User-Agent instead of the default one.
    pub fn user_agent(self, ua: impl Into<String>) -> Self {
        Self {
            user_agent: Some(ua.into()),
            ..self
        }
    }

    /// Set the per-request HTTP timeout, in seconds.
    pub fn timeout(self, secs: u64) -> Self {
        Self { timeout: secs, ..self }
    }

    /// When `yes` is true, do not fall back to HTML link extraction if the
    /// sitemaps produce nothing.
    pub fn no_fallback(self, yes: bool) -> Self {
        Self {
            no_fallback: yes,
            ..self
        }
    }
}
88
/// A discovered URL from sitemap or link extraction.
///
/// This is the public result item returned by [`map`].
#[derive(Debug, Clone, serde::Serialize)]
pub struct MappedUrl {
    /// The discovered URL.
    pub url: String,
    /// Last modification date from sitemap, if available.
    ///
    /// Left out of the serialized output entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
98
99/// Discover URLs on a site via sitemaps and link extraction (no rendering).
100#[allow(clippy::needless_pass_by_value)]
101pub fn map(opts: MapOptions) -> crate::error::Result<Vec<MappedUrl>> {
102    net::ensure_crypto_provider();
103    let seed = net::validate_url(&opts.url)?;
104
105    let include = if opts.include.is_empty() {
106        None
107    } else {
108        Some(crate::scope::build_globset(&opts.include)?)
109    };
110    let exclude = if opts.exclude.is_empty() {
111        None
112    } else {
113        Some(crate::scope::build_globset(&opts.exclude)?)
114    };
115
116    let internal = MapConfig {
117        seed,
118        limit: opts.limit,
119        include,
120        exclude,
121        user_agent: opts.user_agent,
122        timeout: Duration::from_secs(opts.timeout),
123        no_fallback: opts.no_fallback,
124    };
125
126    let mut results = Vec::new();
127    crate::runtime::block_on(run(&internal, |entry| {
128        results.push(MappedUrl {
129            url: entry.url.clone(),
130            lastmod: entry.lastmod.clone(),
131        });
132    }))
133    .map_err(|e| crate::error::Error::Engine(e.to_string()))?;
134    Ok(results)
135}
136
/// Options for URL discovery.
///
/// Crate-internal counterpart of [`MapOptions`] holding already-processed
/// values: a parsed seed URL, compiled glob sets and a `Duration` timeout.
pub(crate) struct MapConfig {
    /// Parsed seed URL; sitemap and candidate URLs are checked against it for same-site membership.
    pub seed: Url,
    /// Maximum number of URLs reported before discovery stops.
    pub limit: usize,
    /// Compiled include globs passed to the scope check; `None` means no include filter.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude globs passed to the scope check; `None` means no exclude filter.
    pub exclude: Option<globset::GlobSet>,
    /// User-Agent override; when `None`, `bridge::default_user_agent()` is used for the HTTP agent.
    pub user_agent: Option<String>,
    /// Timeout handed to the HTTP agent and to the robots.txt fetch.
    pub timeout: Duration,
    /// When `true`, the HTML link-extraction fallback is skipped.
    pub no_fallback: bool,
}
147
/// A discovered URL with optional metadata.
///
/// Crate-internal item handed to the `on_url` callback of [`run`].
#[derive(serde::Serialize)]
pub(crate) struct MapEntry {
    /// The accepted URL, in the form returned by `normalize_url`.
    pub url: String,
    /// `<lastmod>` text taken from the sitemap, if any; omitted from
    /// serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
155
156/// Run URL discovery for a site.
157pub(crate) async fn run(opts: &MapConfig, mut on_url: impl FnMut(&MapEntry)) {
158    let ua = opts
159        .user_agent
160        .as_deref()
161        .unwrap_or_else(|| bridge::default_user_agent());
162    let agent = build_agent(ua, opts.timeout);
163
164    let robots = {
165        let seed = opts.seed.clone();
166        let user_agent = opts.user_agent.clone();
167        let timeout = opts.timeout;
168        tokio::task::spawn_blocking(move || RobotsRules::fetch(&seed, user_agent.as_deref(), timeout))
169            .await
170            .unwrap_or(RobotsPolicy::Unreachable)
171    };
172
173    let mut visited = HashSet::new();
174    let mut count = 0;
175    let mut last_fetch = Instant::now()
176        .checked_sub(MAP_MIN_FETCH_INTERVAL)
177        .unwrap_or_else(Instant::now);
178    let mut sitemap_queue: VecDeque<(Url, u8)> = discover_sitemaps(&robots, &opts.seed)
179        .into_iter()
180        .map(|u| (u, 0))
181        .collect();
182    let mut sitemaps_fetched = 0;
183
184    while let Some((sitemap_url, depth)) = sitemap_queue.pop_front() {
185        if sitemaps_fetched >= MAP_MAX_SITEMAPS || count >= opts.limit {
186            break;
187        }
188        if depth > MAP_MAX_INDEX_DEPTH || !is_same_site(&opts.seed, &sitemap_url) {
189            continue;
190        }
191
192        throttle(&mut last_fetch).await;
193        sitemaps_fetched += 1;
194
195        let body = {
196            let agent = agent.clone();
197            tokio::task::spawn_blocking({
198                let seed = opts.seed.clone();
199                move || fetch_sitemap(&agent, &sitemap_url, &seed)
200            })
201            .await
202            .ok()
203            .flatten()
204        };
205        let Some(body) = body else { continue };
206
207        for entry in parse_sitemap(&body) {
208            match entry {
209                SitemapEntry::Url { loc, lastmod } => {
210                    if count >= opts.limit {
211                        break;
212                    }
213                    if let Some(e) = validate_entry(&loc, lastmod, &opts.seed, &robots, opts, &mut visited) {
214                        on_url(&e);
215                        count += 1;
216                    }
217                }
218                SitemapEntry::Sitemap { loc } => {
219                    if let Ok(url) = Url::parse(&loc) {
220                        sitemap_queue.push_back((url, depth + 1));
221                    }
222                }
223            }
224        }
225    }
226
227    if count == 0 && !opts.no_fallback {
228        throttle(&mut last_fetch).await;
229        let html = {
230            let agent = agent.clone();
231            let seed = opts.seed.clone();
232            tokio::task::spawn_blocking(move || fetch_html(&agent, &seed))
233                .await
234                .ok()
235                .flatten()
236        };
237        if let Some(html) = html {
238            for link in extract_links(&html, &opts.seed) {
239                if count >= opts.limit {
240                    break;
241                }
242                if let Some(e) = validate_entry(link.as_str(), None, &opts.seed, &robots, opts, &mut visited) {
243                    on_url(&e);
244                    count += 1;
245                }
246            }
247        }
248    }
249}
250
251fn discover_sitemaps(robots: &RobotsPolicy, seed: &Url) -> Vec<Url> {
252    let mut urls = Vec::new();
253    if let RobotsPolicy::Rules(rules) = robots {
254        urls.extend(rules.sitemaps.iter().cloned());
255    }
256    if let Ok(default) = seed.join("/sitemap.xml") {
257        if !urls.contains(&default) {
258            urls.push(default);
259        }
260    }
261    urls
262}
263
264fn build_agent(ua: &str, timeout: Duration) -> ureq::Agent {
265    ureq::Agent::new_with_config(
266        ureq::config::Config::builder()
267            .max_redirects(0)
268            .http_status_as_error(false)
269            .timeout_global(Some(timeout))
270            .user_agent(ua)
271            .build(),
272    )
273}
274
275fn fetch_following_redirects(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<ureq::http::Response<ureq::Body>> {
276    let mut current = url.clone();
277    for _ in 0..MAP_MAX_REDIRECTS {
278        let resp = agent.get(current.as_str()).call().ok()?;
279        let status = resp.status().as_u16();
280        if matches!(status, 301 | 302 | 303 | 307 | 308) {
281            let location = resp.headers().get("location")?.to_str().ok()?;
282            let next = current.join(location).ok()?;
283            if net::validate_url_with_policy(next.as_str(), bridge::engine_policy()).is_err()
284                || !is_same_site(seed, &next)
285            {
286                return None;
287            }
288            current = next;
289            continue;
290        }
291        if status >= 400 {
292            return None;
293        }
294        return Some(resp);
295    }
296    None
297}
298
299fn fetch_sitemap(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<String> {
300    let resp = fetch_following_redirects(agent, url, seed)?;
301    let content_type = resp
302        .headers()
303        .get("content-type")
304        .and_then(|v| v.to_str().ok())
305        .unwrap_or("");
306
307    let is_gzip = url
308        .path()
309        .rsplit('/')
310        .next()
311        .and_then(|seg| std::path::Path::new(seg).extension())
312        .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
313        || content_type.contains("gzip")
314        || resp
315            .headers()
316            .get("content-encoding")
317            .and_then(|v| v.to_str().ok())
318            .is_some_and(|v| v.contains("gzip"));
319
320    if is_gzip {
321        let bytes = resp
322            .into_body()
323            .with_config()
324            .limit(MAP_SITEMAP_MAX_BYTES)
325            .read_to_vec()
326            .ok()?;
327        let mut decoded = Vec::new();
328        flate2::read::GzDecoder::new(bytes.as_slice())
329            .take(MAP_SITEMAP_MAX_DECOMPRESSED)
330            .read_to_end(&mut decoded)
331            .ok()?;
332        if decoded.len() as u64 > bytes.len() as u64 * MAP_GZIP_MAX_RATIO {
333            return None;
334        }
335        if looks_like_html(&decoded) {
336            return None;
337        }
338        String::from_utf8(decoded).ok()
339    } else {
340        let body = resp
341            .into_body()
342            .with_config()
343            .limit(MAP_SITEMAP_MAX_BYTES)
344            .read_to_string()
345            .ok()?;
346        if looks_like_html(body.as_bytes()) {
347            return None;
348        }
349        Some(body)
350    }
351}
352
353fn looks_like_html(bytes: &[u8]) -> bool {
354    const DOCTYPE: &[u8] = b"<!doctype";
355    const HTML: &[u8] = b"<html";
356    const BOM: &[u8] = b"\xef\xbb\xbf";
357    let mut prefix = bytes.get(..HTML_SNIFF_LEN).unwrap_or(bytes);
358    if prefix.starts_with(BOM) {
359        prefix = &prefix[BOM.len()..];
360    }
361    let prefix = prefix
362        .iter()
363        .position(|b| !b.is_ascii_whitespace())
364        .map_or(&[][..], |i| &prefix[i..]);
365    prefix
366        .get(..DOCTYPE.len())
367        .is_some_and(|p| p.eq_ignore_ascii_case(DOCTYPE))
368        || prefix.get(..HTML.len()).is_some_and(|p| p.eq_ignore_ascii_case(HTML))
369}
370
371fn fetch_html(agent: &ureq::Agent, url: &Url) -> Option<String> {
372    let resp = fetch_following_redirects(agent, url, url)?;
373    resp.into_body()
374        .with_config()
375        .limit(MAP_HTML_MAX_BYTES)
376        .read_to_string()
377        .ok()
378}
379
380fn extract_links(html: &str, base: &Url) -> Vec<Url> {
381    dom_query::Document::from(html)
382        .select("a[href]")
383        .iter()
384        .filter_map(|el| {
385            let href = el.attr("href")?;
386            let href = href.trim();
387            if href.is_empty() {
388                return None;
389            }
390            let resolved = base.join(href).ok()?;
391            matches!(resolved.scheme(), "http" | "https").then_some(resolved)
392        })
393        .collect()
394}
395
396enum SitemapEntry {
397    Url { loc: String, lastmod: Option<String> },
398    Sitemap { loc: String },
399}
400
401fn parse_sitemap(body: &str) -> Vec<SitemapEntry> {
402    use quick_xml::events::Event;
403    use quick_xml::reader::Reader;
404
405    let mut reader = Reader::from_str(body);
406    let mut entries = Vec::new();
407    let mut buf = Vec::new();
408    let mut capture = Capture::Idle;
409    let mut loc = String::new();
410    let mut lastmod = String::new();
411    let mut in_url = false;
412    let mut in_sitemap = false;
413    let mut depth: u32 = 0;
414
415    loop {
416        match reader.read_event_into(&mut buf) {
417            Ok(Event::Start(e)) => {
418                let name = e.local_name();
419                match name.as_ref() {
420                    b"url" => {
421                        in_url = true;
422                        depth = 0;
423                    }
424                    b"sitemap" => {
425                        in_sitemap = true;
426                        depth = 0;
427                    }
428                    b"loc" if (in_url || in_sitemap) && depth == 0 => capture = Capture::Loc,
429                    b"lastmod" if in_url && depth == 0 => capture = Capture::Lastmod,
430                    _ if in_url || in_sitemap => depth += 1,
431                    _ => {}
432                }
433            }
434            Ok(Event::Text(e)) => {
435                if let Ok(text) = e.xml10_content() {
436                    match capture {
437                        Capture::Loc => loc.push_str(text.trim()),
438                        Capture::Lastmod => lastmod.push_str(text.trim()),
439                        Capture::Idle => {}
440                    }
441                } else {
442                    loc.clear();
443                    lastmod.clear();
444                    capture = Capture::Idle;
445                }
446            }
447            Ok(Event::GeneralRef(e)) => {
448                let resolved = match &*e {
449                    b"amp" => "&",
450                    b"lt" => "<",
451                    b"gt" => ">",
452                    b"quot" => "\"",
453                    b"apos" => "'",
454                    _ => "",
455                };
456                match capture {
457                    Capture::Loc => loc.push_str(resolved),
458                    Capture::Lastmod => lastmod.push_str(resolved),
459                    Capture::Idle => {}
460                }
461            }
462            Ok(Event::End(e)) => {
463                let name = e.local_name();
464                match name.as_ref() {
465                    b"url" if in_url => {
466                        if !loc.is_empty() {
467                            let lm = if lastmod.is_empty() {
468                                None
469                            } else {
470                                Some(std::mem::take(&mut lastmod))
471                            };
472                            entries.push(SitemapEntry::Url {
473                                loc: std::mem::take(&mut loc),
474                                lastmod: lm,
475                            });
476                        }
477                        loc.clear();
478                        lastmod.clear();
479                        in_url = false;
480                    }
481                    b"sitemap" if in_sitemap => {
482                        if !loc.is_empty() {
483                            entries.push(SitemapEntry::Sitemap {
484                                loc: std::mem::take(&mut loc),
485                            });
486                        }
487                        loc.clear();
488                        lastmod.clear();
489                        in_sitemap = false;
490                    }
491                    b"loc" | b"lastmod" if capture != Capture::Idle => capture = Capture::Idle,
492                    _ if depth > 0 => depth -= 1,
493                    _ => {}
494                }
495            }
496            Ok(Event::Eof) | Err(_) => break,
497            _ => {}
498        }
499        buf.clear();
500    }
501
502    entries
503}
504
505#[derive(Clone, Copy, Debug, PartialEq, Eq)]
506enum Capture {
507    Idle,
508    Loc,
509    Lastmod,
510}
511
512fn validate_entry(
513    loc: &str,
514    lastmod: Option<String>,
515    seed: &Url,
516    robots: &RobotsPolicy,
517    opts: &MapConfig,
518    visited: &mut HashSet<String>,
519) -> Option<MapEntry> {
520    if loc.len() > MAP_URL_MAX_LEN {
521        return None;
522    }
523    let url = Url::parse(loc)
524        .ok()
525        .filter(|u| matches!(u.scheme(), "http" | "https"))?;
526    if !is_same_site(seed, &url) {
527        return None;
528    }
529    if !robots.is_allowed(&url) {
530        return None;
531    }
532    if !matches_scope(&url, opts.include.as_ref(), opts.exclude.as_ref()) {
533        return None;
534    }
535    let normalized = normalize_url(&url);
536    if !visited.insert(normalized.clone()) {
537        return None;
538    }
539    Some(MapEntry {
540        url: normalized,
541        lastmod,
542    })
543}
544
545async fn throttle(last_fetch: &mut Instant) {
546    let elapsed = last_fetch.elapsed();
547    if elapsed < MAP_MIN_FETCH_INTERVAL {
548        tokio::time::sleep(MAP_MIN_FETCH_INTERVAL.saturating_sub(elapsed)).await;
549    }
550    *last_fetch = Instant::now();
551}
552
553#[cfg(test)]
554mod tests {
555    use super::*;
556
557    fn test_config(seed: &str) -> MapConfig {
558        MapConfig {
559            seed: Url::parse(seed).unwrap(),
560            limit: 100,
561            include: None,
562            exclude: None,
563            user_agent: None,
564            timeout: Duration::from_secs(30),
565            no_fallback: false,
566        }
567    }
568
569    #[test]
570    fn parse_urlset() {
571        let xml = r#"<?xml version="1.0"?>
572<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
573  <url><loc>https://example.com/a</loc><lastmod>2026-01-01</lastmod></url>
574  <url><loc>https://example.com/b</loc></url>
575</urlset>"#;
576        let entries = parse_sitemap(xml);
577        assert_eq!(entries.len(), 2);
578        match &entries[0] {
579            SitemapEntry::Url { loc, lastmod } => {
580                assert_eq!(loc, "https://example.com/a");
581                assert_eq!(lastmod.as_deref(), Some("2026-01-01"));
582            }
583            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
584        }
585        match &entries[1] {
586            SitemapEntry::Url { loc, lastmod } => {
587                assert_eq!(loc, "https://example.com/b");
588                assert!(lastmod.is_none());
589            }
590            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
591        }
592    }
593
594    #[test]
595    fn parse_sitemapindex() {
596        let xml = r#"<?xml version="1.0"?>
597<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
598  <sitemap><loc>https://example.com/sitemap1.xml</loc></sitemap>
599  <sitemap><loc>https://example.com/sitemap2.xml</loc></sitemap>
600</sitemapindex>"#;
601        let entries = parse_sitemap(xml);
602        assert_eq!(entries.len(), 2);
603        match &entries[0] {
604            SitemapEntry::Sitemap { loc } => assert_eq!(loc, "https://example.com/sitemap1.xml"),
605            SitemapEntry::Url { .. } => panic!("expected Sitemap"),
606        }
607    }
608
609    #[test]
610    fn parse_handles_xml_entities() {
611        let xml = r"<urlset><url><loc>https://example.com/a?b=1&amp;c=2</loc></url></urlset>";
612        let entries = parse_sitemap(xml);
613        assert_eq!(entries.len(), 1);
614        match &entries[0] {
615            SitemapEntry::Url { loc, .. } => assert_eq!(loc, "https://example.com/a?b=1&c=2"),
616            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
617        }
618    }
619
620    #[test]
621    fn parse_handles_namespaced_tags() {
622        let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
623                xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
624  <url>
625    <loc>https://example.com/page</loc>
626    <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
627  </url>
628</urlset>"#;
629        let entries = parse_sitemap(xml);
630        assert_eq!(entries.len(), 1);
631        match &entries[0] {
632            SitemapEntry::Url { loc, .. } => assert_eq!(loc, "https://example.com/page"),
633            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
634        }
635    }
636
637    #[test]
638    fn parse_loc_after_nested_extension() {
639        let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
640                xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
641  <url>
642    <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
643    <loc>https://example.com/page</loc>
644    <lastmod>2026-01-01</lastmod>
645  </url>
646</urlset>"#;
647        let entries = parse_sitemap(xml);
648        assert_eq!(entries.len(), 1);
649        match &entries[0] {
650            SitemapEntry::Url { loc, lastmod } => {
651                assert_eq!(loc, "https://example.com/page");
652                assert_eq!(lastmod.as_deref(), Some("2026-01-01"));
653            }
654            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
655        }
656    }
657
658    #[test]
659    fn parse_empty_body_returns_empty() {
660        assert!(parse_sitemap("").is_empty());
661        assert!(parse_sitemap("<html><body>Not Found</body></html>").is_empty());
662    }
663
664    #[test]
665    fn looks_like_html_detects_variants() {
666        assert!(looks_like_html(b"<!DOCTYPE html>"));
667        assert!(looks_like_html(b"<!doctype html>"));
668        assert!(looks_like_html(b"<html lang=\"en\">"));
669        assert!(looks_like_html(b"<HTML>"));
670        assert!(looks_like_html(b"\xef\xbb\xbf<!DOCTYPE html>"));
671        assert!(looks_like_html(b"  \n<!doctype html>"));
672        assert!(looks_like_html(b"\xef\xbb\xbf  <html>"));
673        assert!(!looks_like_html(b"<?xml version=\"1.0\"?>"));
674        assert!(!looks_like_html(b"<urlset>"));
675        assert!(!looks_like_html(b""));
676    }
677
678    #[test]
679    fn validate_entry_rejects_private_ip() {
680        let opts = test_config("https://example.com");
681
682        let mut visited = HashSet::new();
683        let result = validate_entry(
684            "http://127.0.0.1/secret",
685            None,
686            &opts.seed,
687            &RobotsPolicy::Unavailable,
688            &opts,
689            &mut visited,
690        );
691        assert!(result.is_none());
692    }
693
694    #[test]
695    fn validate_entry_rejects_cross_site() {
696        let opts = test_config("https://example.com");
697        let robots = RobotsPolicy::Unavailable;
698        let mut visited = HashSet::new();
699        let result = validate_entry("https://evil.com/page", None, &opts.seed, &robots, &opts, &mut visited);
700        assert!(result.is_none());
701    }
702
703    #[test]
704    fn validate_entry_deduplicates() {
705        let opts = test_config("https://example.com");
706        let robots = RobotsPolicy::Unavailable;
707        let mut visited = HashSet::new();
708        let first = validate_entry(
709            "https://example.com/page",
710            None,
711            &opts.seed,
712            &robots,
713            &opts,
714            &mut visited,
715        );
716        assert!(first.is_some());
717        let second = validate_entry(
718            "https://example.com/page",
719            None,
720            &opts.seed,
721            &robots,
722            &opts,
723            &mut visited,
724        );
725        assert!(second.is_none());
726    }
727
728    #[test]
729    fn validate_entry_rejects_long_url() {
730        let opts = test_config("https://example.com");
731        let robots = RobotsPolicy::Unavailable;
732        let mut visited = HashSet::new();
733        let long_url = format!("https://example.com/{}", "a".repeat(MAP_URL_MAX_LEN));
734        let result = validate_entry(&long_url, None, &opts.seed, &robots, &opts, &mut visited);
735        assert!(result.is_none());
736    }
737
738    #[test]
739    fn discover_sitemaps_includes_robots_and_default() {
740        let seed = Url::parse("https://example.com").unwrap();
741        let robots = RobotsPolicy::Rules(RobotsRules {
742            rules: Vec::new(),
743            sitemaps: vec![Url::parse("https://example.com/custom-sitemap.xml").unwrap()],
744        });
745        let sitemaps = discover_sitemaps(&robots, &seed);
746        assert_eq!(sitemaps.len(), 2);
747        assert_eq!(sitemaps[0].as_str(), "https://example.com/custom-sitemap.xml");
748        assert_eq!(sitemaps[1].as_str(), "https://example.com/sitemap.xml");
749    }
750
751    #[test]
752    fn discover_sitemaps_deduplicates_default() {
753        let seed = Url::parse("https://example.com").unwrap();
754        let robots = RobotsPolicy::Rules(RobotsRules {
755            rules: Vec::new(),
756            sitemaps: vec![Url::parse("https://example.com/sitemap.xml").unwrap()],
757        });
758        let sitemaps = discover_sitemaps(&robots, &seed);
759        assert_eq!(sitemaps.len(), 1);
760    }
761
762    mod integration {
763        use crate::map::{
764            MapConfig, MapEntry, build_agent, extract_links, fetch_html, fetch_sitemap, parse_sitemap, run,
765        };
766        use std::time::Duration;
767        use url::Url;
768        use wiremock::matchers::{method, path};
769        use wiremock::{Mock, MockServer, ResponseTemplate};
770
771        #[tokio::test]
772        async fn fetch_sitemap_parses_urlset() {
773            let server = MockServer::start().await;
774            let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/a</loc></url></urlset>"#;
775            Mock::given(method("GET"))
776                .and(path("/sitemap.xml"))
777                .respond_with(ResponseTemplate::new(200).set_body_raw(xml.as_bytes().to_vec(), "application/xml"))
778                .mount(&server)
779                .await;
780
781            let agent = build_agent("test/1.0", Duration::from_secs(5));
782            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
783            let body = tokio::task::spawn_blocking(move || fetch_sitemap(&agent, &url, &url))
784                .await
785                .unwrap();
786
787            let entries = parse_sitemap(&body.unwrap());
788            assert_eq!(entries.len(), 1);
789        }
790
791        #[tokio::test]
792        async fn fetch_sitemap_rejects_html_error_page() {
793            let server = MockServer::start().await;
794            Mock::given(method("GET"))
795                .and(path("/sitemap.xml"))
796                .respond_with(ResponseTemplate::new(200).set_body_raw(
797                    b"<!DOCTYPE html><html><body>Not Found</body></html>".to_vec(),
798                    "text/html; charset=utf-8",
799                ))
800                .mount(&server)
801                .await;
802
803            let agent = build_agent("test/1.0", Duration::from_secs(5));
804            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
805            let body = tokio::task::spawn_blocking(move || fetch_sitemap(&agent, &url, &url))
806                .await
807                .unwrap();
808
809            assert!(body.is_none());
810        }
811
812        #[tokio::test]
813        async fn fetch_sitemap_returns_none_on_404() {
814            let server = MockServer::start().await;
815            Mock::given(method("GET"))
816                .and(path("/sitemap.xml"))
817                .respond_with(ResponseTemplate::new(404))
818                .mount(&server)
819                .await;
820
821            let agent = build_agent("test/1.0", Duration::from_secs(5));
822            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
823            let body = tokio::task::spawn_blocking(move || fetch_sitemap(&agent, &url, &url))
824                .await
825                .unwrap();
826
827            assert!(body.is_none());
828        }
829
830        #[tokio::test]
831        async fn fetch_sitemap_handles_gzip() {
832            use flate2::Compression;
833            use flate2::write::GzEncoder;
834            use std::io::Write;
835
836            let server = MockServer::start().await;
837            let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/gz</loc></url></urlset>"#;
838            let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
839            encoder.write_all(xml.as_bytes()).unwrap();
840            let compressed = encoder.finish().unwrap();
841
842            Mock::given(method("GET"))
843                .and(path("/sitemap.xml.gz"))
844                .respond_with(ResponseTemplate::new(200).set_body_raw(compressed, "application/gzip"))
845                .mount(&server)
846                .await;
847
848            let agent = build_agent("test/1.0", Duration::from_secs(5));
849            let url = Url::parse(&format!("{}/sitemap.xml.gz", server.uri())).unwrap();
850            let body = tokio::task::spawn_blocking(move || fetch_sitemap(&agent, &url, &url))
851                .await
852                .unwrap();
853
854            let entries = parse_sitemap(&body.unwrap());
855            assert_eq!(entries.len(), 1);
856        }
857
858        #[tokio::test]
859        async fn fetch_html_extracts_links() {
860            let server = MockServer::start().await;
861            Mock::given(method("GET"))
862                .and(path("/"))
863                .respond_with(ResponseTemplate::new(200).set_body_raw(
864                    br#"<html><body><a href="/link">x</a></body></html>"#.to_vec(),
865                    "text/html; charset=utf-8",
866                ))
867                .mount(&server)
868                .await;
869
870            let agent = build_agent("test/1.0", Duration::from_secs(5));
871            let seed = Url::parse(&server.uri()).unwrap();
872            let html = tokio::task::spawn_blocking({
873                let seed = seed.clone();
874                move || fetch_html(&agent, &seed)
875            })
876            .await
877            .unwrap()
878            .unwrap();
879
880            let links = extract_links(&html, &seed);
881            assert_eq!(links.len(), 1);
882        }
883
884        async fn check_run(server: &MockServer, configure: impl FnOnce(&mut MapConfig)) -> Vec<MapEntry> {
885            let mut config = MapConfig {
886                seed: Url::parse(&server.uri()).unwrap(),
887                limit: 100,
888                include: None,
889                exclude: None,
890                user_agent: Some("test-bot".into()),
891                timeout: Duration::from_secs(5),
892                no_fallback: false,
893            };
894            configure(&mut config);
895            let mut entries = Vec::new();
896            run(&config, |e| {
897                entries.push(MapEntry {
898                    url: e.url.clone(),
899                    lastmod: e.lastmod.clone(),
900                });
901            })
902            .await;
903            entries
904        }
905
906        #[tokio::test]
907        async fn run_discovers_urls_from_sitemap() {
908            let server = MockServer::start().await;
909            Mock::given(method("GET"))
910                .and(path("/robots.txt"))
911                .respond_with(ResponseTemplate::new(200).set_body_string("User-agent: *\nAllow: /"))
912                .mount(&server)
913                .await;
914            let sitemap = format!(
915                "<urlset><url><loc>{}/page1</loc></url><url><loc>{}/page2</loc></url></urlset>",
916                server.uri(),
917                server.uri()
918            );
919            Mock::given(method("GET"))
920                .and(path("/sitemap.xml"))
921                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
922                .mount(&server)
923                .await;
924
925            let entries = check_run(&server, |_| {}).await;
926            assert_eq!(entries.len(), 2);
927            assert!(entries.iter().any(|e| e.url.ends_with("/page1")));
928            assert!(entries.iter().any(|e| e.url.ends_with("/page2")));
929        }
930
931        #[tokio::test]
932        async fn run_respects_limit() {
933            let server = MockServer::start().await;
934            Mock::given(method("GET"))
935                .and(path("/robots.txt"))
936                .respond_with(ResponseTemplate::new(404))
937                .mount(&server)
938                .await;
939            let sitemap = format!(
940                "<urlset><url><loc>{}/a</loc></url><url><loc>{}/b</loc></url><url><loc>{}/c</loc></url></urlset>",
941                server.uri(),
942                server.uri(),
943                server.uri()
944            );
945            Mock::given(method("GET"))
946                .and(path("/sitemap.xml"))
947                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
948                .mount(&server)
949                .await;
950
951            let entries = check_run(&server, |c| c.limit = 2).await;
952            assert_eq!(entries.len(), 2);
953        }
954
955        #[tokio::test]
956        async fn run_follows_sitemap_index() {
957            let server = MockServer::start().await;
958            Mock::given(method("GET"))
959                .and(path("/robots.txt"))
960                .respond_with(ResponseTemplate::new(404))
961                .mount(&server)
962                .await;
963            let index = format!(
964                "<sitemapindex><sitemap><loc>{}/sub.xml</loc></sitemap></sitemapindex>",
965                server.uri()
966            );
967            Mock::given(method("GET"))
968                .and(path("/sitemap.xml"))
969                .respond_with(ResponseTemplate::new(200).set_body_string(index))
970                .mount(&server)
971                .await;
972            let sub = format!("<urlset><url><loc>{}/deep</loc></url></urlset>", server.uri());
973            Mock::given(method("GET"))
974                .and(path("/sub.xml"))
975                .respond_with(ResponseTemplate::new(200).set_body_string(sub))
976                .mount(&server)
977                .await;
978
979            let entries = check_run(&server, |_| {}).await;
980            assert_eq!(entries.len(), 1);
981            assert!(entries[0].url.ends_with("/deep"));
982        }
983
984        #[tokio::test]
985        async fn run_falls_back_to_html_links() {
986            let server = MockServer::start().await;
987            Mock::given(method("GET"))
988                .and(path("/robots.txt"))
989                .respond_with(ResponseTemplate::new(404))
990                .mount(&server)
991                .await;
992            Mock::given(method("GET"))
993                .and(path("/sitemap.xml"))
994                .respond_with(ResponseTemplate::new(404))
995                .mount(&server)
996                .await;
997            let html = format!(
998                r#"<html><body><a href="{}/link1">L1</a><a href="{}/link2">L2</a></body></html>"#,
999                server.uri(),
1000                server.uri()
1001            );
1002            Mock::given(method("GET"))
1003                .and(path("/"))
1004                .respond_with(ResponseTemplate::new(200).set_body_string(html))
1005                .mount(&server)
1006                .await;
1007
1008            let entries = check_run(&server, |_| {}).await;
1009            assert_eq!(entries.len(), 2);
1010        }
1011
1012        #[tokio::test]
1013        async fn run_no_fallback_skips_html() {
1014            let server = MockServer::start().await;
1015            Mock::given(method("GET"))
1016                .and(path("/robots.txt"))
1017                .respond_with(ResponseTemplate::new(404))
1018                .mount(&server)
1019                .await;
1020            Mock::given(method("GET"))
1021                .and(path("/sitemap.xml"))
1022                .respond_with(ResponseTemplate::new(404))
1023                .mount(&server)
1024                .await;
1025            Mock::given(method("GET"))
1026                .and(path("/"))
1027                .respond_with(
1028                    ResponseTemplate::new(200).set_body_string(r#"<html><body><a href="/link">L</a></body></html>"#),
1029                )
1030                .mount(&server)
1031                .await;
1032
1033            let entries = check_run(&server, |c| c.no_fallback = true).await;
1034            assert_eq!(entries.len(), 0);
1035        }
1036
1037        #[tokio::test]
1038        async fn run_deduplicates_urls() {
1039            let server = MockServer::start().await;
1040            Mock::given(method("GET"))
1041                .and(path("/robots.txt"))
1042                .respond_with(ResponseTemplate::new(404))
1043                .mount(&server)
1044                .await;
1045            let sitemap = format!(
1046                "<urlset><url><loc>{}/dup</loc></url><url><loc>{}/dup</loc></url><url><loc>{}/unique</loc></url></urlset>",
1047                server.uri(),
1048                server.uri(),
1049                server.uri()
1050            );
1051            Mock::given(method("GET"))
1052                .and(path("/sitemap.xml"))
1053                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
1054                .mount(&server)
1055                .await;
1056
1057            let entries = check_run(&server, |_| {}).await;
1058            assert_eq!(entries.len(), 2);
1059        }
1060    }
1061}