// servo_fetch/map.rs

//! URL discovery via sitemap parsing, with an HTML link-extraction fallback — no rendering required.
2
3use std::collections::{HashSet, VecDeque};
4use std::io::Read as _;
5use std::time::{Duration, Instant};
6
7use tokio::task::spawn_blocking;
8use url::Url;
9
10use crate::robots::{RobotsPolicy, RobotsRules};
11use crate::scope::{is_same_site, matches_scope, normalize_url};
12use crate::{bridge, net};
13
/// Cap on the number of body bytes read from a sitemap response.
const MAP_SITEMAP_MAX_BYTES: u64 = 50 * 1024 * 1024;
/// Cap on the number of bytes read out of the gzip decoder for a compressed sitemap.
const MAP_SITEMAP_MAX_DECOMPRESSED: u64 = 10 * 1024 * 1024;
/// Largest accepted decompressed-to-compressed size ratio; bigger expansions are discarded.
const MAP_GZIP_MAX_RATIO: u64 = 100;
/// Cap on the number of body bytes read from the HTML fallback page.
const MAP_HTML_MAX_BYTES: u64 = 2 * 1024 * 1024;
/// Redirect budget of the manual redirect-following loop.
const MAP_MAX_REDIRECTS: u8 = 5;
/// Maximum number of sitemap fetches attempted in one discovery run.
const MAP_MAX_SITEMAPS: usize = 200;
/// Sitemaps queued deeper than this index nesting level are skipped.
const MAP_MAX_INDEX_DEPTH: u8 = 5;
/// Minimum spacing enforced between throttled fetches.
const MAP_MIN_FETCH_INTERVAL: Duration = Duration::from_millis(500);
/// Candidate URLs whose string form is longer than this many bytes are rejected.
const MAP_URL_MAX_LEN: usize = 2048;
/// Number of leading body bytes inspected when sniffing for HTML.
const HTML_SNIFF_LEN: usize = 100;
24
/// Options for URL discovery (sitemap + link extraction, no rendering).
///
/// Built with [`MapOptions::new`] and refined through the chained setters; each
/// setter consumes the value and returns the updated copy.
#[must_use = "options do nothing until passed to map()"]
#[derive(Debug, Clone)]
pub struct MapOptions {
    // Seed URL as supplied by the caller (not yet validated).
    url: String,
    // Upper bound on the number of URLs reported.
    limit: usize,
    // Glob patterns a URL must match to be kept (empty = no include filter).
    include: Vec<String>,
    // Glob patterns that remove a URL (empty = no exclude filter).
    exclude: Vec<String>,
    // Custom User-Agent; `None` leaves the choice to the discovery code.
    user_agent: Option<String>,
    // Per-request timeout, in seconds.
    timeout: u64,
    // When true, the HTML link fallback is skipped.
    no_fallback: bool,
}

impl MapOptions {
    /// Create map options for the given URL.
    ///
    /// Defaults: limit of 5000 URLs, no include/exclude patterns, no custom
    /// User-Agent, 30 second timeout, HTML fallback enabled.
    pub fn new(url: impl Into<String>) -> Self {
        let url = url.into();
        Self {
            url,
            limit: 5000,
            include: Vec::new(),
            exclude: Vec::new(),
            user_agent: None,
            timeout: 30,
            no_fallback: false,
        }
    }

    /// Maximum number of URLs to discover.
    pub fn limit(self, n: usize) -> Self {
        Self { limit: n, ..self }
    }

    /// URL path glob patterns to include (replaces any previously set patterns).
    pub fn include(self, patterns: &[&str]) -> Self {
        let include = patterns.iter().copied().map(str::to_owned).collect();
        Self { include, ..self }
    }

    /// URL path glob patterns to exclude (replaces any previously set patterns).
    pub fn exclude(self, patterns: &[&str]) -> Self {
        let exclude = patterns.iter().copied().map(str::to_owned).collect();
        Self { exclude, ..self }
    }

    /// Override the User-Agent string.
    pub fn user_agent(self, ua: impl Into<String>) -> Self {
        Self {
            user_agent: Some(ua.into()),
            ..self
        }
    }

    /// Timeout in seconds per HTTP request.
    pub fn timeout(self, secs: u64) -> Self {
        Self { timeout: secs, ..self }
    }

    /// Skip HTML link fallback if no sitemap is found.
    pub fn no_fallback(self, yes: bool) -> Self {
        Self {
            no_fallback: yes,
            ..self
        }
    }
}
88
/// A discovered URL from sitemap or link extraction.
///
/// Serializes to an object with a `url` field and, only when present, a
/// `lastmod` field.
#[derive(Debug, Clone, serde::Serialize)]
pub struct MappedUrl {
    /// The discovered URL.
    pub url: String,
    /// Last modification date from sitemap, if available.
    // Omitted from serialized output entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
98
/// Discover URLs on a site via sitemaps and link extraction (blocking).
///
/// Drives [`map`] to completion through `crate::runtime::block_on`.
///
/// # Errors
///
/// Returns an error when `block_on` itself fails (wrapped via
/// `Error::engine`) or when [`map`] returns an error.
pub fn map_blocking(opts: &MapOptions) -> crate::error::Result<Vec<MappedUrl>> {
    // `block_on` yields a nested result: `?` unwraps the outer (runtime) layer and
    // the inner result from `map` becomes this function's return value.
    crate::runtime::block_on(map(opts)).map_err(|e| crate::error::Error::engine(e, None))?
}
103
104/// Discover URLs on a site via sitemaps and link extraction.
105pub async fn map(opts: &MapOptions) -> crate::error::Result<Vec<MappedUrl>> {
106    net::ensure_crypto_provider();
107    let seed = net::validate_url(&opts.url)?;
108
109    let include = if opts.include.is_empty() {
110        None
111    } else {
112        Some(crate::scope::build_globset(&opts.include)?)
113    };
114    let exclude = if opts.exclude.is_empty() {
115        None
116    } else {
117        Some(crate::scope::build_globset(&opts.exclude)?)
118    };
119
120    let internal = MapConfig {
121        seed,
122        limit: opts.limit,
123        include,
124        exclude,
125        user_agent: opts.user_agent.clone(),
126        timeout: Duration::from_secs(opts.timeout),
127        no_fallback: opts.no_fallback,
128    };
129
130    let mut results = Vec::new();
131    run(&internal, |entry| {
132        results.push(MappedUrl {
133            url: entry.url.clone(),
134            lastmod: entry.lastmod.clone(),
135        });
136    })
137    .await;
138    Ok(results)
139}
140
/// Options for URL discovery.
///
/// Crate-internal, already-validated counterpart of [`MapOptions`]: the seed is
/// a parsed [`Url`] and the patterns are compiled glob sets.
pub(crate) struct MapConfig {
    /// Parsed seed URL; discovered URLs are compared against it for same-site checks.
    pub seed: Url,
    /// Maximum number of URLs reported by [`run`].
    pub limit: usize,
    /// Compiled include patterns, or `None` when no include filter applies.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude patterns, or `None` when no exclude filter applies.
    pub exclude: Option<globset::GlobSet>,
    /// User-Agent override; `None` falls back to the bridge default.
    pub user_agent: Option<String>,
    /// Timeout applied to each HTTP request.
    pub timeout: Duration,
    /// When true, the HTML link fallback is skipped.
    pub no_fallback: bool,
}
151
/// A discovered URL with optional metadata.
///
/// This is the value handed to the `on_url` callback of [`run`].
#[derive(serde::Serialize)]
pub(crate) struct MapEntry {
    /// Normalized form of the discovered URL.
    pub url: String,
    /// `lastmod` value taken from the sitemap; left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
159
160/// Run URL discovery for a site.
161pub(crate) async fn run(opts: &MapConfig, mut on_url: impl FnMut(&MapEntry)) {
162    let ua = opts
163        .user_agent
164        .as_deref()
165        .unwrap_or_else(|| bridge::default_user_agent());
166    let agent = build_agent(ua, opts.timeout);
167
168    let robots = {
169        let seed = opts.seed.clone();
170        let user_agent = opts.user_agent.clone();
171        let timeout = opts.timeout;
172        spawn_blocking(move || RobotsRules::fetch(&seed, user_agent.as_deref(), timeout))
173            .await
174            .unwrap_or(RobotsPolicy::Unreachable)
175    };
176
177    let mut visited = HashSet::new();
178    let mut count = 0;
179    let mut last_fetch = Instant::now()
180        .checked_sub(MAP_MIN_FETCH_INTERVAL)
181        .unwrap_or_else(Instant::now);
182    let mut sitemap_queue: VecDeque<(Url, u8)> = discover_sitemaps(&robots, &opts.seed)
183        .into_iter()
184        .map(|u| (u, 0))
185        .collect();
186    let mut sitemaps_fetched = 0;
187
188    while let Some((sitemap_url, depth)) = sitemap_queue.pop_front() {
189        if sitemaps_fetched >= MAP_MAX_SITEMAPS || count >= opts.limit {
190            break;
191        }
192        if depth > MAP_MAX_INDEX_DEPTH || !is_same_site(&opts.seed, &sitemap_url) {
193            continue;
194        }
195
196        throttle(&mut last_fetch).await;
197        sitemaps_fetched += 1;
198
199        let body = {
200            let agent = agent.clone();
201            spawn_blocking({
202                let seed = opts.seed.clone();
203                move || fetch_sitemap(&agent, &sitemap_url, &seed)
204            })
205            .await
206            .ok()
207            .flatten()
208        };
209        let Some(body) = body else { continue };
210
211        for entry in parse_sitemap(&body) {
212            match entry {
213                SitemapEntry::Url { loc, lastmod } => {
214                    if count >= opts.limit {
215                        break;
216                    }
217                    if let Some(e) = validate_entry(&loc, lastmod, &opts.seed, &robots, opts, &mut visited) {
218                        on_url(&e);
219                        count += 1;
220                    }
221                }
222                SitemapEntry::Sitemap { loc } => {
223                    if let Ok(url) = Url::parse(&loc) {
224                        sitemap_queue.push_back((url, depth + 1));
225                    }
226                }
227            }
228        }
229    }
230
231    if count == 0 && !opts.no_fallback {
232        throttle(&mut last_fetch).await;
233        let html = {
234            let agent = agent.clone();
235            let seed = opts.seed.clone();
236            spawn_blocking(move || fetch_html(&agent, &seed)).await.ok().flatten()
237        };
238        if let Some(html) = html {
239            for link in extract_links(&html, &opts.seed) {
240                if count >= opts.limit {
241                    break;
242                }
243                if let Some(e) = validate_entry(link.as_str(), None, &opts.seed, &robots, opts, &mut visited) {
244                    on_url(&e);
245                    count += 1;
246                }
247            }
248        }
249    }
250}
251
252fn discover_sitemaps(robots: &RobotsPolicy, seed: &Url) -> Vec<Url> {
253    let mut urls = Vec::new();
254    if let RobotsPolicy::Rules(rules) = robots {
255        urls.extend(rules.sitemaps.iter().cloned());
256    }
257    if let Ok(default) = seed.join("/sitemap.xml") {
258        if !urls.contains(&default) {
259            urls.push(default);
260        }
261    }
262    urls
263}
264
265fn build_agent(ua: &str, timeout: Duration) -> ureq::Agent {
266    ureq::Agent::new_with_config(
267        ureq::config::Config::builder()
268            .max_redirects(0)
269            .http_status_as_error(false)
270            .timeout_global(Some(timeout))
271            .user_agent(ua)
272            .build(),
273    )
274}
275
276fn fetch_following_redirects(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<ureq::http::Response<ureq::Body>> {
277    let mut current = url.clone();
278    for _ in 0..MAP_MAX_REDIRECTS {
279        let resp = agent.get(current.as_str()).call().ok()?;
280        let status = resp.status().as_u16();
281        if matches!(status, 301 | 302 | 303 | 307 | 308) {
282            let location = resp.headers().get("location")?.to_str().ok()?;
283            let next = current.join(location).ok()?;
284            if net::validate_url_with_policy(next.as_str(), bridge::engine_policy()).is_err()
285                || !is_same_site(seed, &next)
286            {
287                return None;
288            }
289            current = next;
290            continue;
291        }
292        if status >= 400 {
293            return None;
294        }
295        return Some(resp);
296    }
297    None
298}
299
300fn fetch_sitemap(agent: &ureq::Agent, url: &Url, seed: &Url) -> Option<String> {
301    let resp = fetch_following_redirects(agent, url, seed)?;
302    let content_type = resp
303        .headers()
304        .get("content-type")
305        .and_then(|v| v.to_str().ok())
306        .unwrap_or("");
307
308    let is_gzip = url
309        .path()
310        .rsplit('/')
311        .next()
312        .and_then(|seg| std::path::Path::new(seg).extension())
313        .is_some_and(|ext| ext.eq_ignore_ascii_case("gz"))
314        || content_type.contains("gzip")
315        || resp
316            .headers()
317            .get("content-encoding")
318            .and_then(|v| v.to_str().ok())
319            .is_some_and(|v| v.contains("gzip"));
320
321    if is_gzip {
322        let bytes = resp
323            .into_body()
324            .with_config()
325            .limit(MAP_SITEMAP_MAX_BYTES)
326            .read_to_vec()
327            .ok()?;
328        let mut decoded = Vec::new();
329        flate2::read::GzDecoder::new(bytes.as_slice())
330            .take(MAP_SITEMAP_MAX_DECOMPRESSED)
331            .read_to_end(&mut decoded)
332            .ok()?;
333        if decoded.len() as u64 > bytes.len() as u64 * MAP_GZIP_MAX_RATIO {
334            return None;
335        }
336        if looks_like_html(&decoded) {
337            return None;
338        }
339        String::from_utf8(decoded).ok()
340    } else {
341        let body = resp
342            .into_body()
343            .with_config()
344            .limit(MAP_SITEMAP_MAX_BYTES)
345            .read_to_string()
346            .ok()?;
347        if looks_like_html(body.as_bytes()) {
348            return None;
349        }
350        Some(body)
351    }
352}
353
354fn looks_like_html(bytes: &[u8]) -> bool {
355    const DOCTYPE: &[u8] = b"<!doctype";
356    const HTML: &[u8] = b"<html";
357    const BOM: &[u8] = b"\xef\xbb\xbf";
358    let mut prefix = bytes.get(..HTML_SNIFF_LEN).unwrap_or(bytes);
359    if prefix.starts_with(BOM) {
360        prefix = &prefix[BOM.len()..];
361    }
362    let prefix = prefix
363        .iter()
364        .position(|b| !b.is_ascii_whitespace())
365        .map_or(&[][..], |i| &prefix[i..]);
366    prefix
367        .get(..DOCTYPE.len())
368        .is_some_and(|p| p.eq_ignore_ascii_case(DOCTYPE))
369        || prefix.get(..HTML.len()).is_some_and(|p| p.eq_ignore_ascii_case(HTML))
370}
371
372fn fetch_html(agent: &ureq::Agent, url: &Url) -> Option<String> {
373    let resp = fetch_following_redirects(agent, url, url)?;
374    resp.into_body()
375        .with_config()
376        .limit(MAP_HTML_MAX_BYTES)
377        .read_to_string()
378        .ok()
379}
380
381fn extract_links(html: &str, base: &Url) -> Vec<Url> {
382    dom_query::Document::from(html)
383        .select("a[href]")
384        .iter()
385        .filter_map(|el| {
386            let href = el.attr("href")?;
387            let href = href.trim();
388            if href.is_empty() {
389                return None;
390            }
391            let resolved = base.join(href).ok()?;
392            matches!(resolved.scheme(), "http" | "https").then_some(resolved)
393        })
394        .collect()
395}
396
/// One record produced by [`parse_sitemap`].
enum SitemapEntry {
    /// A `<url>` record: the page location and its optional `<lastmod>` text.
    Url { loc: String, lastmod: Option<String> },
    /// A `<sitemap>` record from a sitemap index: the location of a child sitemap.
    Sitemap { loc: String },
}
401
402fn parse_sitemap(body: &str) -> Vec<SitemapEntry> {
403    use quick_xml::events::Event;
404    use quick_xml::reader::Reader;
405
406    let mut reader = Reader::from_str(body);
407    let mut entries = Vec::new();
408    let mut buf = Vec::new();
409    let mut capture = Capture::Idle;
410    let mut loc = String::new();
411    let mut lastmod = String::new();
412    let mut in_url = false;
413    let mut in_sitemap = false;
414    let mut depth: u32 = 0;
415
416    loop {
417        match reader.read_event_into(&mut buf) {
418            Ok(Event::Start(e)) => {
419                let name = e.local_name();
420                match name.as_ref() {
421                    b"url" => {
422                        in_url = true;
423                        depth = 0;
424                    }
425                    b"sitemap" => {
426                        in_sitemap = true;
427                        depth = 0;
428                    }
429                    b"loc" if (in_url || in_sitemap) && depth == 0 => capture = Capture::Loc,
430                    b"lastmod" if in_url && depth == 0 => capture = Capture::Lastmod,
431                    _ if in_url || in_sitemap => depth += 1,
432                    _ => {}
433                }
434            }
435            Ok(Event::Text(e)) => {
436                if let Ok(text) = e.xml10_content() {
437                    match capture {
438                        Capture::Loc => loc.push_str(text.trim()),
439                        Capture::Lastmod => lastmod.push_str(text.trim()),
440                        Capture::Idle => {}
441                    }
442                } else {
443                    loc.clear();
444                    lastmod.clear();
445                    capture = Capture::Idle;
446                }
447            }
448            Ok(Event::GeneralRef(e)) => {
449                let resolved = match &*e {
450                    b"amp" => "&",
451                    b"lt" => "<",
452                    b"gt" => ">",
453                    b"quot" => "\"",
454                    b"apos" => "'",
455                    _ => "",
456                };
457                match capture {
458                    Capture::Loc => loc.push_str(resolved),
459                    Capture::Lastmod => lastmod.push_str(resolved),
460                    Capture::Idle => {}
461                }
462            }
463            Ok(Event::End(e)) => {
464                let name = e.local_name();
465                match name.as_ref() {
466                    b"url" if in_url => {
467                        if !loc.is_empty() {
468                            let lm = if lastmod.is_empty() {
469                                None
470                            } else {
471                                Some(std::mem::take(&mut lastmod))
472                            };
473                            entries.push(SitemapEntry::Url {
474                                loc: std::mem::take(&mut loc),
475                                lastmod: lm,
476                            });
477                        }
478                        loc.clear();
479                        lastmod.clear();
480                        in_url = false;
481                    }
482                    b"sitemap" if in_sitemap => {
483                        if !loc.is_empty() {
484                            entries.push(SitemapEntry::Sitemap {
485                                loc: std::mem::take(&mut loc),
486                            });
487                        }
488                        loc.clear();
489                        lastmod.clear();
490                        in_sitemap = false;
491                    }
492                    b"loc" | b"lastmod" if capture != Capture::Idle => capture = Capture::Idle,
493                    _ if depth > 0 => depth -= 1,
494                    _ => {}
495                }
496            }
497            Ok(Event::Eof) | Err(_) => break,
498            _ => {}
499        }
500        buf.clear();
501    }
502
503    entries
504}
505
/// Tracks which sitemap field the parser is currently collecting text for.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Capture {
    /// Not inside a captured element; text is ignored.
    Idle,
    /// Inside a record's direct `<loc>` child.
    Loc,
    /// Inside a `<url>` record's direct `<lastmod>` child.
    Lastmod,
}
512
513fn validate_entry(
514    loc: &str,
515    lastmod: Option<String>,
516    seed: &Url,
517    robots: &RobotsPolicy,
518    opts: &MapConfig,
519    visited: &mut HashSet<String>,
520) -> Option<MapEntry> {
521    if loc.len() > MAP_URL_MAX_LEN {
522        return None;
523    }
524    let url = Url::parse(loc)
525        .ok()
526        .filter(|u| matches!(u.scheme(), "http" | "https"))?;
527    if !is_same_site(seed, &url) {
528        return None;
529    }
530    if !robots.is_allowed(&url) {
531        return None;
532    }
533    if !matches_scope(&url, opts.include.as_ref(), opts.exclude.as_ref()) {
534        return None;
535    }
536    let normalized = normalize_url(&url);
537    if !visited.insert(normalized.clone()) {
538        return None;
539    }
540    Some(MapEntry {
541        url: normalized,
542        lastmod,
543    })
544}
545
546async fn throttle(last_fetch: &mut Instant) {
547    let elapsed = last_fetch.elapsed();
548    if elapsed < MAP_MIN_FETCH_INTERVAL {
549        tokio::time::sleep(MAP_MIN_FETCH_INTERVAL.saturating_sub(elapsed)).await;
550    }
551    *last_fetch = Instant::now();
552}
553
554#[cfg(test)]
555mod tests {
556    use super::*;
557
    /// Build a `MapConfig` for `seed` with no filters, a limit of 100 and a
    /// 30 second timeout. Panics if `seed` is not a valid URL.
    fn test_config(seed: &str) -> MapConfig {
        MapConfig {
            seed: Url::parse(seed).unwrap(),
            limit: 100,
            include: None,
            exclude: None,
            user_agent: None,
            timeout: Duration::from_secs(30),
            no_fallback: false,
        }
    }
569
570    #[test]
571    fn parse_urlset() {
572        let xml = r#"<?xml version="1.0"?>
573<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
574  <url><loc>https://example.com/a</loc><lastmod>2026-01-01</lastmod></url>
575  <url><loc>https://example.com/b</loc></url>
576</urlset>"#;
577        let entries = parse_sitemap(xml);
578        assert_eq!(entries.len(), 2);
579        match &entries[0] {
580            SitemapEntry::Url { loc, lastmod } => {
581                assert_eq!(loc, "https://example.com/a");
582                assert_eq!(lastmod.as_deref(), Some("2026-01-01"));
583            }
584            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
585        }
586        match &entries[1] {
587            SitemapEntry::Url { loc, lastmod } => {
588                assert_eq!(loc, "https://example.com/b");
589                assert!(lastmod.is_none());
590            }
591            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
592        }
593    }
594
595    #[test]
596    fn parse_sitemapindex() {
597        let xml = r#"<?xml version="1.0"?>
598<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
599  <sitemap><loc>https://example.com/sitemap1.xml</loc></sitemap>
600  <sitemap><loc>https://example.com/sitemap2.xml</loc></sitemap>
601</sitemapindex>"#;
602        let entries = parse_sitemap(xml);
603        assert_eq!(entries.len(), 2);
604        match &entries[0] {
605            SitemapEntry::Sitemap { loc } => assert_eq!(loc, "https://example.com/sitemap1.xml"),
606            SitemapEntry::Url { .. } => panic!("expected Sitemap"),
607        }
608    }
609
610    #[test]
611    fn parse_handles_xml_entities() {
612        let xml = r"<urlset><url><loc>https://example.com/a?b=1&amp;c=2</loc></url></urlset>";
613        let entries = parse_sitemap(xml);
614        assert_eq!(entries.len(), 1);
615        match &entries[0] {
616            SitemapEntry::Url { loc, .. } => assert_eq!(loc, "https://example.com/a?b=1&c=2"),
617            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
618        }
619    }
620
621    #[test]
622    fn parse_handles_namespaced_tags() {
623        let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
624                xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
625  <url>
626    <loc>https://example.com/page</loc>
627    <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
628  </url>
629</urlset>"#;
630        let entries = parse_sitemap(xml);
631        assert_eq!(entries.len(), 1);
632        match &entries[0] {
633            SitemapEntry::Url { loc, .. } => assert_eq!(loc, "https://example.com/page"),
634            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
635        }
636    }
637
638    #[test]
639    fn parse_loc_after_nested_extension() {
640        let xml = r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
641                xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
642  <url>
643    <image:image><image:loc>https://example.com/img.png</image:loc></image:image>
644    <loc>https://example.com/page</loc>
645    <lastmod>2026-01-01</lastmod>
646  </url>
647</urlset>"#;
648        let entries = parse_sitemap(xml);
649        assert_eq!(entries.len(), 1);
650        match &entries[0] {
651            SitemapEntry::Url { loc, lastmod } => {
652                assert_eq!(loc, "https://example.com/page");
653                assert_eq!(lastmod.as_deref(), Some("2026-01-01"));
654            }
655            SitemapEntry::Sitemap { .. } => panic!("expected Url"),
656        }
657    }
658
    /// Neither an empty body nor an HTML page produces any sitemap entries.
    #[test]
    fn parse_empty_body_returns_empty() {
        assert!(parse_sitemap("").is_empty());
        assert!(parse_sitemap("<html><body>Not Found</body></html>").is_empty());
    }
664
    /// HTML sniffing is case-insensitive and tolerates a BOM and leading
    /// whitespace; XML prologs, sitemap roots and empty input are not HTML.
    #[test]
    fn looks_like_html_detects_variants() {
        assert!(looks_like_html(b"<!DOCTYPE html>"));
        assert!(looks_like_html(b"<!doctype html>"));
        assert!(looks_like_html(b"<html lang=\"en\">"));
        assert!(looks_like_html(b"<HTML>"));
        // UTF-8 BOM before the doctype.
        assert!(looks_like_html(b"\xef\xbb\xbf<!DOCTYPE html>"));
        assert!(looks_like_html(b"  \n<!doctype html>"));
        assert!(looks_like_html(b"\xef\xbb\xbf  <html>"));
        assert!(!looks_like_html(b"<?xml version=\"1.0\"?>"));
        assert!(!looks_like_html(b"<urlset>"));
        assert!(!looks_like_html(b""));
    }
678
    /// A loopback-address URL found in a sitemap for `example.com` is not accepted.
    // NOTE(review): `validate_entry` has no explicit IP check; the rejection
    // presumably comes from the same-site comparison with the seed — confirm.
    #[test]
    fn validate_entry_rejects_private_ip() {
        let opts = test_config("https://example.com");

        let mut visited = HashSet::new();
        let result = validate_entry(
            "http://127.0.0.1/secret",
            None,
            &opts.seed,
            &RobotsPolicy::Unavailable,
            &opts,
            &mut visited,
        );
        assert!(result.is_none());
    }
694
695    #[test]
696    fn validate_entry_rejects_cross_site() {
697        let opts = test_config("https://example.com");
698        let robots = RobotsPolicy::Unavailable;
699        let mut visited = HashSet::new();
700        let result = validate_entry("https://evil.com/page", None, &opts.seed, &robots, &opts, &mut visited);
701        assert!(result.is_none());
702    }
703
704    #[test]
705    fn validate_entry_deduplicates() {
706        let opts = test_config("https://example.com");
707        let robots = RobotsPolicy::Unavailable;
708        let mut visited = HashSet::new();
709        let first = validate_entry(
710            "https://example.com/page",
711            None,
712            &opts.seed,
713            &robots,
714            &opts,
715            &mut visited,
716        );
717        assert!(first.is_some());
718        let second = validate_entry(
719            "https://example.com/page",
720            None,
721            &opts.seed,
722            &robots,
723            &opts,
724            &mut visited,
725        );
726        assert!(second.is_none());
727    }
728
729    #[test]
730    fn validate_entry_rejects_long_url() {
731        let opts = test_config("https://example.com");
732        let robots = RobotsPolicy::Unavailable;
733        let mut visited = HashSet::new();
734        let long_url = format!("https://example.com/{}", "a".repeat(MAP_URL_MAX_LEN));
735        let result = validate_entry(&long_url, None, &opts.seed, &robots, &opts, &mut visited);
736        assert!(result.is_none());
737    }
738
    /// Sitemaps from robots rules come first, followed by the default `/sitemap.xml`.
    #[test]
    fn discover_sitemaps_includes_robots_and_default() {
        let seed = Url::parse("https://example.com").unwrap();
        let robots = RobotsPolicy::Rules(RobotsRules {
            rules: Vec::new(),
            sitemaps: vec![Url::parse("https://example.com/custom-sitemap.xml").unwrap()],
        });
        let sitemaps = discover_sitemaps(&robots, &seed);
        assert_eq!(sitemaps.len(), 2);
        assert_eq!(sitemaps[0].as_str(), "https://example.com/custom-sitemap.xml");
        assert_eq!(sitemaps[1].as_str(), "https://example.com/sitemap.xml");
    }
751
    /// The default `/sitemap.xml` is not added twice when robots already lists it.
    #[test]
    fn discover_sitemaps_deduplicates_default() {
        let seed = Url::parse("https://example.com").unwrap();
        let robots = RobotsPolicy::Rules(RobotsRules {
            rules: Vec::new(),
            sitemaps: vec![Url::parse("https://example.com/sitemap.xml").unwrap()],
        });
        let sitemaps = discover_sitemaps(&robots, &seed);
        assert_eq!(sitemaps.len(), 1);
    }
762
763    mod integration {
764        use std::time::Duration;
765
766        use tokio::task::spawn_blocking;
767        use url::Url;
768        use wiremock::matchers::{method, path};
769        use wiremock::{Mock, MockServer, ResponseTemplate};
770
771        use crate::map::{
772            MapConfig, MapEntry, build_agent, extract_links, fetch_html, fetch_sitemap, parse_sitemap, run,
773        };
774
        /// An XML sitemap served by the mock server is fetched and parsed into one entry.
        #[tokio::test]
        async fn fetch_sitemap_parses_urlset() {
            let server = MockServer::start().await;
            let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/a</loc></url></urlset>"#;
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(200).set_body_raw(xml.as_bytes().to_vec(), "application/xml"))
                .mount(&server)
                .await;

            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
            // The sitemap URL is also passed as the seed for same-site checks.
            let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();

            let entries = parse_sitemap(&body.unwrap());
            assert_eq!(entries.len(), 1);
        }
792
        /// A 200 response whose body is an HTML page (soft 404) is not returned as a sitemap.
        #[tokio::test]
        async fn fetch_sitemap_rejects_html_error_page() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(200).set_body_raw(
                    b"<!DOCTYPE html><html><body>Not Found</body></html>".to_vec(),
                    "text/html; charset=utf-8",
                ))
                .mount(&server)
                .await;

            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
            let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();

            assert!(body.is_none());
        }
811
        /// A 404 response yields no sitemap body.
        #[tokio::test]
        async fn fetch_sitemap_returns_none_on_404() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/sitemap.xml"))
                .respond_with(ResponseTemplate::new(404))
                .mount(&server)
                .await;

            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let url = Url::parse(&format!("{}/sitemap.xml", server.uri())).unwrap();
            let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();

            assert!(body.is_none());
        }
827
        /// A gzip-compressed sitemap served at a `.gz` URL is decompressed and parsed.
        #[tokio::test]
        async fn fetch_sitemap_handles_gzip() {
            use std::io::Write as _;

            use flate2::Compression;
            use flate2::write::GzEncoder;

            let server = MockServer::start().await;
            let xml = r#"<?xml version="1.0"?><urlset><url><loc>https://example.com/gz</loc></url></urlset>"#;
            // Compress the fixture in memory so the mock serves real gzip bytes.
            let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
            encoder.write_all(xml.as_bytes()).unwrap();
            let compressed = encoder.finish().unwrap();

            Mock::given(method("GET"))
                .and(path("/sitemap.xml.gz"))
                .respond_with(ResponseTemplate::new(200).set_body_raw(compressed, "application/gzip"))
                .mount(&server)
                .await;

            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let url = Url::parse(&format!("{}/sitemap.xml.gz", server.uri())).unwrap();
            let body = spawn_blocking(move || fetch_sitemap(&agent, &url, &url)).await.unwrap();

            let entries = parse_sitemap(&body.unwrap());
            assert_eq!(entries.len(), 1);
        }
854
        /// The HTML fallback path: fetch the seed page and extract its single link.
        #[tokio::test]
        async fn fetch_html_extracts_links() {
            let server = MockServer::start().await;
            Mock::given(method("GET"))
                .and(path("/"))
                .respond_with(ResponseTemplate::new(200).set_body_raw(
                    br#"<html><body><a href="/link">x</a></body></html>"#.to_vec(),
                    "text/html; charset=utf-8",
                ))
                .mount(&server)
                .await;

            let agent = build_agent("test/1.0", Duration::from_secs(5));
            let seed = Url::parse(&server.uri()).unwrap();
            // `fetch_html` is blocking, so it runs on the blocking pool; the clone
            // keeps `seed` available for `extract_links` below.
            let html = spawn_blocking({
                let seed = seed.clone();
                move || fetch_html(&agent, &seed)
            })
            .await
            .unwrap()
            .unwrap();

            let links = extract_links(&html, &seed);
            assert_eq!(links.len(), 1);
        }
880
        /// Run discovery against `server` and collect every reported entry.
        ///
        /// Starts from a default config (limit 100, `test-bot` User-Agent, 5 second
        /// timeout, fallback enabled) that `configure` may adjust before the run.
        async fn check_run(server: &MockServer, configure: impl FnOnce(&mut MapConfig)) -> Vec<MapEntry> {
            let mut config = MapConfig {
                seed: Url::parse(&server.uri()).unwrap(),
                limit: 100,
                include: None,
                exclude: None,
                user_agent: Some("test-bot".into()),
                timeout: Duration::from_secs(5),
                no_fallback: false,
            };
            configure(&mut config);
            let mut entries = Vec::new();
            run(&config, |e| {
                // The callback only borrows the entry, so copy its fields out.
                entries.push(MapEntry {
                    url: e.url.clone(),
                    lastmod: e.lastmod.clone(),
                });
            })
            .await;
            entries
        }
902
903        #[tokio::test]
904        async fn run_discovers_urls_from_sitemap() {
905            let server = MockServer::start().await;
906            Mock::given(method("GET"))
907                .and(path("/robots.txt"))
908                .respond_with(ResponseTemplate::new(200).set_body_string("User-agent: *\nAllow: /"))
909                .mount(&server)
910                .await;
911            let sitemap = format!(
912                "<urlset><url><loc>{}/page1</loc></url><url><loc>{}/page2</loc></url></urlset>",
913                server.uri(),
914                server.uri()
915            );
916            Mock::given(method("GET"))
917                .and(path("/sitemap.xml"))
918                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
919                .mount(&server)
920                .await;
921
922            let entries = check_run(&server, |_| {}).await;
923            assert_eq!(entries.len(), 2);
924            assert!(entries.iter().any(|e| e.url.ends_with("/page1")));
925            assert!(entries.iter().any(|e| e.url.ends_with("/page2")));
926        }
927
928        #[tokio::test]
929        async fn run_respects_limit() {
930            let server = MockServer::start().await;
931            Mock::given(method("GET"))
932                .and(path("/robots.txt"))
933                .respond_with(ResponseTemplate::new(404))
934                .mount(&server)
935                .await;
936            let sitemap = format!(
937                "<urlset><url><loc>{}/a</loc></url><url><loc>{}/b</loc></url><url><loc>{}/c</loc></url></urlset>",
938                server.uri(),
939                server.uri(),
940                server.uri()
941            );
942            Mock::given(method("GET"))
943                .and(path("/sitemap.xml"))
944                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
945                .mount(&server)
946                .await;
947
948            let entries = check_run(&server, |c| c.limit = 2).await;
949            assert_eq!(entries.len(), 2);
950        }
951
952        #[tokio::test]
953        async fn run_follows_sitemap_index() {
954            let server = MockServer::start().await;
955            Mock::given(method("GET"))
956                .and(path("/robots.txt"))
957                .respond_with(ResponseTemplate::new(404))
958                .mount(&server)
959                .await;
960            let index = format!(
961                "<sitemapindex><sitemap><loc>{}/sub.xml</loc></sitemap></sitemapindex>",
962                server.uri()
963            );
964            Mock::given(method("GET"))
965                .and(path("/sitemap.xml"))
966                .respond_with(ResponseTemplate::new(200).set_body_string(index))
967                .mount(&server)
968                .await;
969            let sub = format!("<urlset><url><loc>{}/deep</loc></url></urlset>", server.uri());
970            Mock::given(method("GET"))
971                .and(path("/sub.xml"))
972                .respond_with(ResponseTemplate::new(200).set_body_string(sub))
973                .mount(&server)
974                .await;
975
976            let entries = check_run(&server, |_| {}).await;
977            assert_eq!(entries.len(), 1);
978            assert!(entries[0].url.ends_with("/deep"));
979        }
980
981        #[tokio::test]
982        async fn run_falls_back_to_html_links() {
983            let server = MockServer::start().await;
984            Mock::given(method("GET"))
985                .and(path("/robots.txt"))
986                .respond_with(ResponseTemplate::new(404))
987                .mount(&server)
988                .await;
989            Mock::given(method("GET"))
990                .and(path("/sitemap.xml"))
991                .respond_with(ResponseTemplate::new(404))
992                .mount(&server)
993                .await;
994            let html = format!(
995                r#"<html><body><a href="{}/link1">L1</a><a href="{}/link2">L2</a></body></html>"#,
996                server.uri(),
997                server.uri()
998            );
999            Mock::given(method("GET"))
1000                .and(path("/"))
1001                .respond_with(ResponseTemplate::new(200).set_body_string(html))
1002                .mount(&server)
1003                .await;
1004
1005            let entries = check_run(&server, |_| {}).await;
1006            assert_eq!(entries.len(), 2);
1007        }
1008
1009        #[tokio::test]
1010        async fn run_no_fallback_skips_html() {
1011            let server = MockServer::start().await;
1012            Mock::given(method("GET"))
1013                .and(path("/robots.txt"))
1014                .respond_with(ResponseTemplate::new(404))
1015                .mount(&server)
1016                .await;
1017            Mock::given(method("GET"))
1018                .and(path("/sitemap.xml"))
1019                .respond_with(ResponseTemplate::new(404))
1020                .mount(&server)
1021                .await;
1022            Mock::given(method("GET"))
1023                .and(path("/"))
1024                .respond_with(
1025                    ResponseTemplate::new(200).set_body_string(r#"<html><body><a href="/link">L</a></body></html>"#),
1026                )
1027                .mount(&server)
1028                .await;
1029
1030            let entries = check_run(&server, |c| c.no_fallback = true).await;
1031            assert_eq!(entries.len(), 0);
1032        }
1033
1034        #[tokio::test]
1035        async fn run_deduplicates_urls() {
1036            let server = MockServer::start().await;
1037            Mock::given(method("GET"))
1038                .and(path("/robots.txt"))
1039                .respond_with(ResponseTemplate::new(404))
1040                .mount(&server)
1041                .await;
1042            let sitemap = format!(
1043                "<urlset><url><loc>{}/dup</loc></url><url><loc>{}/dup</loc></url><url><loc>{}/unique</loc></url></urlset>",
1044                server.uri(),
1045                server.uri(),
1046                server.uri()
1047            );
1048            Mock::given(method("GET"))
1049                .and(path("/sitemap.xml"))
1050                .respond_with(ResponseTemplate::new(200).set_body_string(sitemap))
1051                .mount(&server)
1052                .await;
1053
1054            let entries = check_run(&server, |_| {}).await;
1055            assert_eq!(entries.len(), 2);
1056        }
1057    }
1058}