servo_fetch/
crawl.rs

1//! Site crawling — BFS link traversal with scope, robots.txt, and rate limiting.
2
3use std::collections::{HashSet, VecDeque};
4use std::fmt;
5use std::hash::{DefaultHasher, Hash, Hasher};
6use std::time::{Duration, SystemTime};
7
8use tokio::task::{JoinSet, spawn_blocking};
9use tokio::time::{MissedTickBehavior, interval};
10use url::Url;
11
12use crate::bridge::{self, PageFetcher};
13use crate::net;
14use crate::robots::RobotsPolicy;
15use crate::scope::{is_same_site, matches_scope, normalize_url};
16
/// Upper bound (2 MiB) on how much of a fetched page's HTML is processed.
/// Longer documents are cut down to this size (on a char boundary) before
/// content extraction and link discovery.
const MAX_HTML_BYTES: usize = 2 * 1024 * 1024;
18
/// Options for crawling a site.
///
/// Created with [`CrawlOptions::new`] and refined through the chained setter
/// methods; the values are read when the crawl plan is built.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    /// Seed URL exactly as supplied by the caller; validated when the crawl starts.
    pub(crate) url: String,
    /// Maximum number of pages to crawl (default: 50).
    pub(crate) limit: usize,
    /// Maximum link depth from the seed URL (default: 3).
    pub(crate) max_depth: usize,
    /// Page load timeout per page (default: 30s).
    pub(crate) timeout: Duration,
    /// Extra wait after the load event per page (default: zero).
    pub(crate) settle: Duration,
    /// URL path glob patterns to include; empty means no include filter.
    pub(crate) include: Vec<String>,
    /// URL path glob patterns to exclude; empty means no exclude filter.
    pub(crate) exclude: Vec<String>,
    /// Optional CSS selector restricting extraction to a section of each page.
    pub(crate) selector: Option<String>,
    /// Emit extracted content as JSON instead of Markdown (default: false).
    pub(crate) json: bool,
    /// Optional User-Agent override, stored already sanitized.
    pub(crate) user_agent: Option<String>,
    /// Maximum parallel fetches (default: 1; the setter clamps to at least 1).
    pub(crate) concurrency: usize,
    /// Minimum dispatch interval (default: `Some(500ms)`); `None` disables rate limiting.
    pub(crate) delay: Option<Duration>,
}
36
37impl CrawlOptions {
38    /// Create crawl options for the given seed URL.
39    pub fn new(url: &str) -> Self {
40        Self {
41            url: url.into(),
42            limit: 50,
43            max_depth: 3,
44            timeout: Duration::from_secs(30),
45            settle: Duration::ZERO,
46            include: Vec::new(),
47            exclude: Vec::new(),
48            selector: None,
49            json: false,
50            user_agent: None,
51            concurrency: 1,
52            delay: Some(Duration::from_millis(500)),
53        }
54    }
55
56    /// Maximum number of pages to crawl (default: 50).
57    pub fn limit(mut self, n: usize) -> Self {
58        self.limit = n;
59        self
60    }
61
62    /// Maximum link depth from the seed URL (default: 3).
63    pub fn max_depth(mut self, n: usize) -> Self {
64        self.max_depth = n;
65        self
66    }
67
68    /// Page load timeout per page (default: 30s).
69    pub fn timeout(mut self, timeout: Duration) -> Self {
70        self.timeout = timeout;
71        self
72    }
73
74    /// Extra wait after load event per page (default: 0).
75    pub fn settle(mut self, settle: Duration) -> Self {
76        self.settle = settle;
77        self
78    }
79
80    /// URL path glob patterns to include (e.g. `"/docs/**"`).
81    pub fn include(mut self, patterns: &[&str]) -> Self {
82        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
83        self
84    }
85
86    /// URL path glob patterns to exclude (e.g. `"/docs/archive/**"`).
87    pub fn exclude(mut self, patterns: &[&str]) -> Self {
88        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
89        self
90    }
91
92    /// Output crawled content as JSON instead of Markdown.
93    pub fn json(mut self, json: bool) -> Self {
94        self.json = json;
95        self
96    }
97
98    /// CSS selector to extract a specific section per page.
99    pub fn selector(mut self, selector: impl Into<String>) -> Self {
100        self.selector = Some(selector.into());
101        self
102    }
103
104    /// Override the User-Agent string for all pages in this crawl.
105    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
106        self.user_agent = Some(net::sanitize_user_agent(ua.into()));
107        self
108    }
109
110    /// Maximum parallel fetches (default: 1). Values below 1 are clamped to 1.
111    /// Results are yielded in completion order when greater than 1.
112    pub fn concurrency(mut self, n: usize) -> Self {
113        self.concurrency = n.max(1);
114        self
115    }
116
117    /// Minimum dispatch interval (default: `Some(500ms)`). `None` disables rate limiting.
118    pub fn delay(mut self, delay: Option<Duration>) -> Self {
119        self.delay = delay;
120        self
121    }
122}
123
/// Result for a single crawled page.
///
/// One value is produced per reported page, whether the fetch succeeded or
/// failed; `outcome` distinguishes the two cases.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL of the crawled page.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Wall-clock time when the fetch completed.
    pub fetched_at: SystemTime,
    /// Page content if successful, or error if failed.
    pub outcome: Result<CrawlPage, CrawlError>,
}
137
/// Successfully crawled page.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Page title; `None` when the document has no non-empty `<title>` text.
    pub title: Option<String>,
    /// Extracted content (Markdown or JSON depending on options).
    pub content: String,
    /// Number of links discovered on this page (counted before scope filtering).
    pub links_found: usize,
}
148
/// Error from a failed crawl attempt.
///
/// Carries only a human-readable message; its `Display` output is that message.
#[derive(Debug, Clone)]
pub struct CrawlError {
    /// Error message.
    pub message: String,
}
155
156impl fmt::Display for CrawlError {
157    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
158        f.write_str(&self.message)
159    }
160}
161
// Marker impl: the trait's default methods are enough because `CrawlError`
// holds only a message and no underlying source error.
impl std::error::Error for CrawlError {}
163
164impl serde::Serialize for CrawlResult {
165    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
166        use serde::ser::SerializeMap;
167        let fetched_at = humantime::format_rfc3339_millis(self.fetched_at).to_string();
168        match &self.outcome {
169            Ok(page) => {
170                let mut map = serializer.serialize_map(None)?;
171                map.serialize_entry("type", "page")?;
172                map.serialize_entry("url", &self.url)?;
173                map.serialize_entry("depth", &self.depth)?;
174                map.serialize_entry("fetched_at", &fetched_at)?;
175                if let Some(t) = &page.title {
176                    map.serialize_entry("title", t)?;
177                }
178                map.serialize_entry("content", &page.content)?;
179                map.serialize_entry("links_found", &page.links_found)?;
180                map.end()
181            }
182            Err(e) => {
183                let mut map = serializer.serialize_map(None)?;
184                map.serialize_entry("type", "error")?;
185                map.serialize_entry("url", &self.url)?;
186                map.serialize_entry("depth", &self.depth)?;
187                map.serialize_entry("fetched_at", &fetched_at)?;
188                map.serialize_entry("error", &e.message)?;
189                map.end()
190            }
191        }
192    }
193}
194
195impl CrawlResult {
196    fn from_internal(r: &CrawlPageResult) -> Self {
197        let outcome = match r.status {
198            CrawlStatus::Ok => Ok(CrawlPage {
199                title: r.title.clone(),
200                content: r.content.clone().unwrap_or_default(),
201                links_found: r.links_found,
202            }),
203            CrawlStatus::Error => Err(CrawlError {
204                message: r.error.clone().unwrap_or_default(),
205            }),
206        };
207        Self {
208            url: r.url.clone(),
209            depth: r.depth,
210            fetched_at: r.fetched_at,
211            outcome,
212        }
213    }
214}
215
216/// Crawl a site, invoking `on_page` for each result as it arrives.
217#[allow(clippy::needless_pass_by_value)]
218pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
219    net::ensure_crypto_provider();
220    let plan = build_crawl_plan(&opts)?;
221    crate::runtime::block_on(async {
222        let robots = spawn_blocking({
223            let seed = plan.seed.clone();
224            let user_agent = plan.user_agent.clone();
225            let timeout = Duration::from_secs(plan.timeout_secs);
226            move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
227        })
228        .await
229        .unwrap_or(RobotsPolicy::Unreachable);
230        run(plan, robots, &bridge::ServoFetcher, |r| {
231            on_page(&CrawlResult::from_internal(r));
232        })
233        .await
234    })
235    .map_err(|e| crate::error::Error::Engine(e.to_string()))?;
236    Ok(())
237}
238
239/// Crawl a site and collect all results.
240#[allow(clippy::needless_pass_by_value)]
241pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
242    let mut results = Vec::new();
243    crawl_each(opts, |r| results.push(r.clone()))?;
244    Ok(results)
245}
246
247fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
248    let seed = net::validate_url(&opts.url)?;
249    let include = if opts.include.is_empty() {
250        None
251    } else {
252        Some(crate::scope::build_globset(&opts.include)?)
253    };
254    let exclude = if opts.exclude.is_empty() {
255        None
256    } else {
257        Some(crate::scope::build_globset(&opts.exclude)?)
258    };
259    Ok(CrawlPlan {
260        seed,
261        limit: opts.limit,
262        max_depth: opts.max_depth,
263        timeout_secs: opts.timeout.as_secs().max(1),
264        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
265        include,
266        exclude,
267        selector: opts.selector.clone(),
268        json: opts.json,
269        user_agent: opts.user_agent.clone(),
270        concurrency: opts.concurrency,
271        delay: opts.delay,
272    })
273}
274
/// Crawl configuration.
///
/// The resolved form of the user-facing options: the seed is a parsed URL,
/// glob patterns are compiled, and durations are plain integers.
pub(crate) struct CrawlPlan {
    /// Seed URL; crawling starts here at depth 0.
    pub seed: Url,
    /// Page budget: dispatching stops once results plus in-flight fetches reach it.
    pub limit: usize,
    /// Links are only followed from pages whose depth is below this value.
    pub max_depth: usize,
    /// Per-page timeout in whole seconds, handed to the fetcher.
    pub timeout_secs: u64,
    /// Post-load settle time in milliseconds, handed to the fetcher.
    pub settle_ms: u64,
    /// Compiled include globs; `None` when no include patterns were given.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude globs; `None` when no exclude patterns were given.
    pub exclude: Option<globset::GlobSet>,
    /// Optional CSS selector passed to content extraction.
    pub selector: Option<String>,
    /// Extract content as JSON instead of text.
    pub json: bool,
    /// Optional User-Agent override used for fetches.
    pub user_agent: Option<String>,
    /// Parallel fetch limit (clamped to >=1; yields in completion order when >1).
    pub concurrency: usize,
    /// Dispatch interval; `None` disables rate limiting.
    pub delay: Option<Duration>,
}
292
/// Result for a single crawled page.
///
/// Internal, flat representation: `status` says which of the optional fields
/// are meaningful (`title`/`content` for `Ok`, `error` for `Error`).
pub(crate) struct CrawlPageResult {
    /// URL of the page, as a string.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Whether the fetch succeeded or failed.
    pub status: CrawlStatus,
    /// Page title, when one was found.
    pub title: Option<String>,
    /// Extracted content; `None` for errors or when extraction failed.
    pub content: Option<String>,
    /// Error message; set only for failed fetches.
    pub error: Option<String>,
    /// Number of links discovered on the page (0 for failed fetches).
    pub links_found: usize,
    /// Wall-clock time when the fetch completed.
    pub fetched_at: SystemTime,
}
304
/// Status of a crawled page.
pub(crate) enum CrawlStatus {
    /// The page was fetched successfully.
    Ok,
    /// The fetch failed; the accompanying result carries an error message.
    Error,
}
310
/// Crawl frontier: the FIFO of pages still to fetch plus the dedup state.
struct Frontier {
    /// Pending `(url, depth)` pairs, consumed front-first.
    queue: VecDeque<(Url, usize)>,
    /// Normalized form of every URL ever enqueued, including the seed.
    visited: HashSet<String>,
    /// Hashes of page contents already seen, used to drop duplicate pages.
    content_hashes: HashSet<u64>,
}
316
317impl Frontier {
318    fn new(seed: &Url) -> Self {
319        Self {
320            queue: VecDeque::from([(seed.clone(), 0)]),
321            visited: HashSet::from([normalize_url(seed)]),
322            content_hashes: HashSet::new(),
323        }
324    }
325
326    fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
327        if self.visited.insert(normalize_url(&url)) {
328            self.queue.push_back((url, depth));
329            true
330        } else {
331            false
332        }
333    }
334
335    fn pop(&mut self) -> Option<(Url, usize)> {
336        self.queue.pop_front()
337    }
338
339    fn is_duplicate_content(&mut self, content: &str) -> bool {
340        let mut h = DefaultHasher::new();
341        content.hash(&mut h);
342        !self.content_hashes.insert(h.finish())
343    }
344
345    fn pending(&self) -> usize {
346        self.queue.len()
347    }
348}
349
350fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
351    dom_query::Document::from(html)
352        .select("a[href]")
353        .iter()
354        .filter_map(|el| {
355            let href = el.attr("href")?;
356            let href = href.trim();
357            if href.is_empty() {
358                return None;
359            }
360            let resolved = base.join(href).ok()?;
361            matches!(resolved.scheme(), "http" | "https").then_some(resolved)
362        })
363        .collect()
364}
365
366pub(crate) async fn run(
367    opts: CrawlPlan,
368    robots: RobotsPolicy,
369    fetcher: &(impl PageFetcher + Clone),
370    mut on_page: impl FnMut(&CrawlPageResult),
371) -> Vec<CrawlPageResult> {
372    let mut frontier = Frontier::new(&opts.seed);
373    let mut results = Vec::new();
374    let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();
375
376    // `Delay` keeps the steady-state rate correct after fetches exceed `delay`.
377    let mut ticker = opts.delay.map(|period| {
378        let mut t = interval(period);
379        t.set_missed_tick_behavior(MissedTickBehavior::Delay);
380        t
381    });
382
383    let concurrency = opts.concurrency.max(1);
384
385    loop {
386        while in_flight.len() < concurrency && results.len() + in_flight.len() < opts.limit {
387            let Some((url, depth)) = frontier.pop() else {
388                break;
389            };
390            if let Some(t) = ticker.as_mut() {
391                t.tick().await;
392            }
393            spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
394        }
395
396        let outcome = match in_flight.join_next().await {
397            None => break,
398            Some(Ok(o)) => o,
399            Some(Err(e)) if e.is_panic() => {
400                tracing::error!(err = %e, "crawl fetch task panicked");
401                continue;
402            }
403            Some(Err(e)) => {
404                tracing::warn!(err = %e, "crawl fetch task cancelled");
405                continue;
406            }
407        };
408
409        let FetchOutcome {
410            url,
411            depth,
412            result,
413            fetched_at,
414        } = outcome;
415        let page = match result {
416            Ok(p) => p,
417            Err(msg) => {
418                let r = error_result(&url, depth, msg, fetched_at);
419                on_page(&r);
420                results.push(r);
421                continue;
422            }
423        };
424
425        let budget_used = results.len() + in_flight.len() + 1;
426        let mut ctx = CrawlContext {
427            frontier: &mut frontier,
428            robots: &robots,
429            opts: &opts,
430        };
431        if let Some(r) = process_ok_fetch(&mut ctx, &url, depth, &page, budget_used, fetched_at) {
432            on_page(&r);
433            results.push(r);
434        }
435    }
436
437    results
438}
439
440fn spawn_fetch(
441    in_flight: &mut JoinSet<FetchOutcome>,
442    fetcher: &(impl PageFetcher + Clone),
443    opts: &CrawlPlan,
444    url: Url,
445    depth: usize,
446) {
447    let url_str = url.to_string();
448    let timeout = opts.timeout_secs;
449    let settle = opts.settle_ms;
450    let user_agent = opts.user_agent.clone();
451    let f = fetcher.clone();
452    in_flight.spawn_blocking(move || {
453        let result = f
454            .fetch_page(bridge::FetchOptions {
455                url: &url_str,
456                timeout_secs: timeout,
457                settle_ms: settle,
458                mode: bridge::FetchMode::Content { include_a11y: false },
459                user_agent: user_agent.as_deref(),
460            })
461            .map_err(|e| format!("{e:#}"));
462        FetchOutcome {
463            url,
464            depth,
465            result,
466            fetched_at: SystemTime::now(),
467        }
468    });
469}
470
/// Stable crawl state passed to `process_ok_fetch`.
///
/// Bundles the borrows that stay the same from page to page so the function
/// signature only lists per-page arguments.
struct CrawlContext<'a> {
    /// Frontier to receive discovered links and track content hashes.
    frontier: &'a mut Frontier,
    /// Robots policy checked for every candidate link.
    robots: &'a RobotsPolicy,
    /// The active crawl plan (limits, scope, extraction settings).
    opts: &'a CrawlPlan,
}
477
478/// Build a `CrawlPageResult` and enqueue discovered links.
479fn process_ok_fetch(
480    ctx: &mut CrawlContext<'_>,
481    url: &Url,
482    depth: usize,
483    page: &bridge::ServoPage,
484    budget_used: usize,
485    fetched_at: SystemTime,
486) -> Option<CrawlPageResult> {
487    let html = if page.html.len() > MAX_HTML_BYTES {
488        &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
489    } else {
490        &page.html
491    };
492
493    let input = crate::extract::ExtractInput::new(html, url.as_str())
494        .with_layout_json(page.layout_json.as_deref())
495        .with_inner_text(page.inner_text.as_deref())
496        .with_selector(ctx.opts.selector.as_deref());
497
498    let content = if ctx.opts.json {
499        crate::extract::extract_json(&input).ok()
500    } else {
501        crate::extract::extract_text(&input).ok()
502    };
503
504    if content.as_ref().is_some_and(|c| ctx.frontier.is_duplicate_content(c)) {
505        return None;
506    }
507
508    let links = extract_links_from_html(html, url);
509    let links_found = links.len();
510
511    if depth < ctx.opts.max_depth {
512        for link in &links {
513            if budget_used + ctx.frontier.pending() >= ctx.opts.limit {
514                break;
515            }
516            if !is_same_site(&ctx.opts.seed, link)
517                || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
518                || !ctx.robots.is_allowed(link)
519                || !matches_scope(link, ctx.opts.include.as_ref(), ctx.opts.exclude.as_ref())
520            {
521                continue;
522            }
523            ctx.frontier.try_enqueue(link.clone(), depth + 1);
524        }
525    }
526
527    let title = {
528        let doc = dom_query::Document::from(html);
529        let t = doc.select("title").text().to_string();
530        (!t.is_empty()).then_some(t)
531    };
532
533    Some(CrawlPageResult {
534        url: url.to_string(),
535        depth,
536        status: CrawlStatus::Ok,
537        title,
538        content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
539        error: None,
540        links_found,
541        fetched_at,
542    })
543}
544
/// Fetch result crossing the `JoinSet` boundary; error is pre-stringified for `Send`.
struct FetchOutcome {
    /// URL that was fetched.
    url: Url,
    /// Link depth of that URL from the seed.
    depth: usize,
    /// The fetched page, or the formatted error message on failure.
    result: Result<bridge::ServoPage, String>,
    /// Wall-clock time taken right after the fetch returned.
    fetched_at: SystemTime,
}
552
553fn error_result(url: &Url, depth: usize, error: String, fetched_at: SystemTime) -> CrawlPageResult {
554    CrawlPageResult {
555        url: url.to_string(),
556        depth,
557        status: CrawlStatus::Error,
558        title: None,
559        content: None,
560        error: Some(error),
561        links_found: 0,
562        fetched_at,
563    }
564}
565
// Unit tests: option-builder behaviour, end-to-end crawls against an in-memory
// mock fetcher, and the small helpers (`Frontier`, link extraction, error records).
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;

    use super::*;

    // `new` must produce the documented defaults.
    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
        assert_eq!(opts.concurrency, 1);
        assert_eq!(opts.delay, Some(Duration::from_millis(500)));
    }

    // Chained setters each overwrite their own field.
    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
        assert_eq!(opts.concurrency, 4);
        assert_eq!(opts.delay, None);
    }

    // A concurrency of 0 is raised to 1.
    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let opts = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(opts.concurrency, 1);
    }

    // A custom delay is stored unchanged.
    #[test]
    fn crawl_options_delay_custom_value() {
        let opts = CrawlOptions::new("https://example.com").delay(Some(Duration::from_secs(2)));
        assert_eq!(opts.delay, Some(Duration::from_secs(2)));
    }

    // CR and LF in a User-Agent are each replaced by a space.
    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler  /2.0"));
    }

    /// In-memory fetcher: maps a URL string to the HTML to return for it.
    #[derive(Clone)]
    struct MockFetcher(Arc<HashMap<String, String>>);

    impl MockFetcher {
        /// Build the fetcher from `(url, html)` pairs.
        fn new(pages: &[(&str, &str)]) -> Self {
            Self(Arc::new(
                pages.iter().map(|(u, h)| (u.to_string(), h.to_string())).collect(),
            ))
        }
    }

    impl PageFetcher for MockFetcher {
        /// Return the stored HTML for the requested URL, or a "not found" error
        /// when the URL is not in the map.
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> anyhow::Result<bridge::ServoPage> {
            self.0
                .get(opts.url)
                .map(|html| bridge::ServoPage {
                    html: html.clone(),
                    ..Default::default()
                })
                .ok_or_else(|| anyhow::anyhow!("not found: {}", opts.url))
        }
    }

    /// HTML page titled "Test" whose body is one anchor per entry in `links`.
    fn page(links: &[&str]) -> String {
        use std::fmt::Write as _;
        let mut anchors = String::new();
        for l in links {
            write!(anchors, r#"<a href="{l}">link</a>"#).unwrap();
        }
        format!("<html><head><title>Test</title></head><body>{anchors}</body></html>")
    }

    /// Leaf page with unique body to avoid content-hash dedup.
    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

    /// Test helper: build `CrawlPlan`, run, assert. `delay=None` keeps tests fast.
    ///
    /// The first entry of `pages` is the seed. `configure` may adjust the
    /// default plan before the run; `assert` receives the collected results.
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        assert: impl FnOnce(&[CrawlPageResult]),
    ) {
        let fetcher = MockFetcher::new(pages);
        let seed = pages[0].0;
        let mut opts = CrawlPlan {
            seed: Url::parse(seed).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
        };
        configure(&mut opts);
        let results = run(opts, RobotsPolicy::Unavailable, &fetcher, |_| {}).await;
        assert(&results);
    }

    // A seed with no links yields exactly one result.
    #[tokio::test]
    async fn crawl_single_page() {
        check(
            &[("https://example.com/", &page(&[]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 1);
                assert_eq!(r[0].url, "https://example.com/");
            },
        )
        .await;
    }

    // Links on the seed are fetched too.
    #[tokio::test]
    async fn crawl_follows_links() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                (
                    "https://example.com/a",
                    "<html><head><title>A</title></head><body>page a</body></html>",
                ),
                (
                    "https://example.com/b",
                    "<html><head><title>B</title></head><body>page b</body></html>",
                ),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    // With `max_depth = 1` only the seed and its direct links are crawled.
    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a"])),
                ("https://example.com/a", &page(&["/b"])),
                ("https://example.com/b", &page(&["/c"])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.max_depth = 1,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // The page limit caps the number of results.
    #[tokio::test]
    async fn crawl_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c"])),
                ("https://example.com/a", &page(&[])),
                ("https://example.com/b", &page(&[])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.limit = 2,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // Links to a different site are not followed.
    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        check(
            &[
                ("https://example.com/", &page(&["https://other.com/x"])),
                ("https://other.com/x", &page(&[])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 1),
        )
        .await;
    }

    // Repeated links and links back to the seed are fetched only once.
    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/a", "/a"])),
                ("https://example.com/a", &page(&["/"])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // A link the fetcher cannot serve is reported as an error result.
    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        check(
            &[("https://example.com/", &page(&["/missing"]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 2);
                assert!(matches!(r[1].status, CrawlStatus::Error));
                assert!(r[1].error.is_some());
            },
        )
        .await;
    }

    // Only links matching the include glob are followed.
    #[tokio::test]
    async fn crawl_applies_include_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/docs/a", "/blog/b"])),
                ("https://example.com/docs/a", &page(&[])),
                ("https://example.com/blog/b", &page(&[])),
            ],
            |o| o.include = Some(crate::scope::build_globset(&["/docs/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(r.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!r.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    // Links matching the exclude glob are skipped.
    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/public", "/secret/data"])),
                ("https://example.com/public", &page(&[])),
                ("https://example.com/secret/data", &page(&[])),
            ],
            |o| o.exclude = Some(crate::scope::build_globset(&["/secret/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(!r.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    // Two pages with identical content produce only one result between them.
    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", same),
                ("https://example.com/b", same),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // With four parallel fetches every page is still visited exactly once.
    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| o.concurrency = 4,
            |r| {
                assert_eq!(r.len(), 5);
                let urls: HashSet<&str> = r.iter().map(|p| p.url.as_str()).collect();
                for u in [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ] {
                    assert!(urls.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    // The page limit also holds when fetching in parallel.
    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| {
                o.concurrency = 4;
                o.limit = 3;
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    // With a single fetch at a time, results come back in breadth-first order.
    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| o.concurrency = 1,
            |r| {
                assert_eq!(r.len(), 3);
                assert_eq!(r[0].url, "https://example.com/");
                assert_eq!(r[1].url, "https://example.com/a");
                assert_eq!(r[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    // Runs on tokio's paused clock, so elapsed time is measured without real waiting.
    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        // 3 pages at 500ms delay = 2 ticks >= 1s (first dispatch is free).
        let start = tokio::time::Instant::now();
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| {
                o.concurrency = 1;
                o.delay = Some(Duration::from_millis(500));
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    // The seed counts as visited; any other URL can be enqueued only once.
    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.try_enqueue(seed, 0));
        let other = Url::parse("https://example.com/page").unwrap();
        assert!(f.try_enqueue(other.clone(), 1));
        assert!(!f.try_enqueue(other, 1));
    }

    // A new frontier holds just the seed at depth 0; popping it empties the queue.
    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert_eq!(f.pending(), 1);
        let (url, depth) = f.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);
        assert_eq!(f.pending(), 0);
        assert!(f.pop().is_none());
    }

    // Only http(s) targets survive; javascript:, data: and mailto: links are dropped.
    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
            <a href="javascript:void(0)">JS</a>
            <a href="JAVASCRIPT:alert(1)">JS upper</a>
            <a href="data:text/html,<h1>hi</h1>">Data</a>
            <a href="mailto:x@y.com">Mail</a>
            <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = extract_links_from_html(html, &base);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].as_str(), "https://example.com/a");
        assert_eq!(links[1].as_str(), "https://example.com/relative");
    }

    // An error record carries the message and has no content.
    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let r = error_result(&url, 2, "timeout".into(), SystemTime::now());
        assert!(matches!(r.status, CrawlStatus::Error));
        assert_eq!(r.error.as_deref(), Some("timeout"));
        assert!(r.content.is_none());
    }

    // The second sighting of the same content is a duplicate; new content is not.
    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.is_duplicate_content("unique content"));
        assert!(f.is_duplicate_content("unique content"));
        assert!(!f.is_duplicate_content("different content"));
    }
}
servo_fetch/crawl.rs

servo_fetch/
crawl.rs