servo_fetch/crawl.rs
//! Site crawling — BFS link traversal with scope, robots.txt, and rate limiting.
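//!
//! A minimal usage sketch (illustrative; it assumes `CrawlOptions` and `crawl`
//! are re-exported at the crate root, and is `no_run` because it fetches live
//! pages):
//!
//! ```no_run
//! use servo_fetch::{CrawlOptions, crawl};
//!
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! for result in crawl(CrawlOptions::new("https://example.com").limit(10))? {
//!     match &result.outcome {
//!         Ok(page) => println!("ok    {} ({} links)", result.url, page.links_found),
//!         Err(e) => println!("error {}: {e}", result.url),
//!     }
//! }
//! # Ok(())
//! # }
//! ```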

use std::collections::{HashSet, VecDeque};
use std::hash::{Hash, Hasher};
use std::time::Duration;

use tokio::task::JoinSet;
use tokio::time::{MissedTickBehavior, interval};
use url::Url;

use crate::bridge::{self, PageFetcher};
use crate::net;
use crate::robots::RobotsPolicy;
use crate::scope::{is_same_site, matches_scope, normalize_url};

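/// Cap on per-page HTML bytes fed to content extraction and link discovery.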
const MAX_HTML_BYTES: usize = 2 * 1024 * 1024;

/// Options for crawling a site.
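///
/// A configuration sketch (illustrative; crate-root re-export assumed):
///
/// ```no_run
/// use std::time::Duration;
/// use servo_fetch::CrawlOptions;
///
/// let opts = CrawlOptions::new("https://example.com/docs")
///     .limit(100)
///     .max_depth(2)
///     .include(&["/docs/**"])
///     .exclude(&["/docs/archive/**"])
///     .concurrency(4)
///     .delay(Some(Duration::from_millis(250)));
/// # let _ = opts;
/// ```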
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    pub(crate) url: String,
    pub(crate) limit: usize,
    pub(crate) max_depth: usize,
    pub(crate) timeout: Duration,
    pub(crate) settle: Duration,
    pub(crate) include: Vec<String>,
    pub(crate) exclude: Vec<String>,
    pub(crate) selector: Option<String>,
    pub(crate) json: bool,
    pub(crate) user_agent: Option<String>,
    pub(crate) concurrency: usize,
    pub(crate) delay: Option<Duration>,
}

impl CrawlOptions {
    /// Create crawl options for the given seed URL.
    pub fn new(url: &str) -> Self {
        Self {
            url: url.into(),
            limit: 50,
            max_depth: 3,
            timeout: Duration::from_secs(30),
            settle: Duration::ZERO,
            include: Vec::new(),
            exclude: Vec::new(),
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: Some(Duration::from_millis(500)),
        }
    }

    /// Maximum number of pages to crawl (default: 50).
    pub fn limit(mut self, n: usize) -> Self {
        self.limit = n;
        self
    }

    /// Maximum link depth from the seed URL (default: 3).
    pub fn max_depth(mut self, n: usize) -> Self {
        self.max_depth = n;
        self
    }

    /// Page load timeout per page (default: 30s).
    pub fn timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }

    /// Extra wait after load event per page (default: 0).
    pub fn settle(mut self, settle: Duration) -> Self {
        self.settle = settle;
        self
    }

    /// URL path glob patterns to include (e.g. `"/docs/**"`).
    pub fn include(mut self, patterns: &[&str]) -> Self {
        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
        self
    }

    /// URL path glob patterns to exclude (e.g. `"/docs/archive/**"`).
    pub fn exclude(mut self, patterns: &[&str]) -> Self {
        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
        self
    }

    /// Output crawled content as JSON instead of Markdown.
    pub fn json(mut self, json: bool) -> Self {
        self.json = json;
        self
    }

    /// CSS selector to extract a specific section per page.
    pub fn selector(mut self, selector: impl Into<String>) -> Self {
        self.selector = Some(selector.into());
        self
    }

    /// Override the User-Agent string for all pages in this crawl.
    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
        self.user_agent = Some(net::sanitize_user_agent(ua.into()));
        self
    }

    /// Maximum parallel fetches (default: 1). Values below 1 are clamped to 1.
    /// Results are yielded in completion order when greater than 1.
    pub fn concurrency(mut self, n: usize) -> Self {
        self.concurrency = n.max(1);
        self
    }

    /// Minimum dispatch interval (default: `Some(500ms)`). `None` disables rate limiting.
    pub fn delay(mut self, delay: Option<Duration>) -> Self {
        self.delay = delay;
        self
    }
}

/// Result for a single crawled page.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL of the crawled page.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Page content if successful, or error if failed.
    pub outcome: Result<CrawlPage, CrawlError>,
}

/// Successfully crawled page.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Page title.
    pub title: Option<String>,
    /// Extracted content (Markdown or JSON depending on options).
    pub content: String,
    /// Number of links discovered on this page.
    pub links_found: usize,
}

/// Error from a failed crawl attempt.
#[derive(Debug, Clone)]
pub struct CrawlError {
    /// Error message.
    pub message: String,
}

impl std::fmt::Display for CrawlError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.message)
    }
}

impl std::error::Error for CrawlError {}

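// Hand-rolled `Serialize` keeps optional fields out of the output. Shape
// (illustrative):
//   ok:    {"url":"...","depth":1,"status":"ok","title":"...","content":"...","links_found":3}
//   error: {"url":"...","depth":1,"status":"error","error":"..."}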
impl serde::Serialize for CrawlResult {
    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        use serde::ser::SerializeMap;
        match &self.outcome {
            Ok(page) => {
                let mut map = serializer.serialize_map(None)?;
                map.serialize_entry("url", &self.url)?;
                map.serialize_entry("depth", &self.depth)?;
                map.serialize_entry("status", "ok")?;
                if let Some(t) = &page.title {
                    map.serialize_entry("title", t)?;
                }
                map.serialize_entry("content", &page.content)?;
                map.serialize_entry("links_found", &page.links_found)?;
                map.end()
            }
            Err(e) => {
                let mut map = serializer.serialize_map(None)?;
                map.serialize_entry("url", &self.url)?;
                map.serialize_entry("depth", &self.depth)?;
                map.serialize_entry("status", "error")?;
                map.serialize_entry("error", &e.message)?;
                map.end()
            }
        }
    }
}

impl CrawlResult {
    fn from_internal(r: &CrawlPageResult) -> Self {
        let outcome = match r.status {
            CrawlStatus::Ok => Ok(CrawlPage {
                title: r.title.clone(),
                content: r.content.clone().unwrap_or_default(),
                links_found: r.links_found,
            }),
            CrawlStatus::Error => Err(CrawlError {
                message: r.error.clone().unwrap_or_default(),
            }),
        };
        Self {
            url: r.url.clone(),
            depth: r.depth,
            outcome,
        }
    }
}

/// Crawl a site, invoking `on_page` for each result as it arrives.
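///
/// Streaming sketch (illustrative; crate-root re-export assumed). Results
/// arrive in completion order when `concurrency > 1`:
///
/// ```no_run
/// use servo_fetch::{CrawlOptions, crawl_each};
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// crawl_each(CrawlOptions::new("https://example.com").concurrency(4), |r| {
///     println!("{} (depth {})", r.url, r.depth);
/// })?;
/// # Ok(())
/// # }
/// ```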
#[allow(clippy::needless_pass_by_value)]
pub fn crawl_each(
    opts: CrawlOptions,
    mut on_page: impl FnMut(&CrawlResult),
) -> crate::error::Result<()> {
    net::ensure_crypto_provider();
    let plan = build_crawl_plan(&opts)?;
    crate::runtime::block_on(async {
        let robots = tokio::task::spawn_blocking({
            let seed = plan.seed.clone();
            let user_agent = plan.user_agent.clone();
            let timeout = Duration::from_secs(plan.timeout_secs);
            move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
        })
        .await
        .unwrap_or(RobotsPolicy::Unreachable);
        run(plan, robots, &bridge::ServoFetcher, |r| {
            on_page(&CrawlResult::from_internal(r));
        })
        .await
    })
    .map_err(|e| crate::error::Error::Engine(e.to_string()))?;
    Ok(())
}

/// Crawl a site and collect all results.
#[allow(clippy::needless_pass_by_value)]
pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
    let mut results = Vec::new();
    crawl_each(opts, |r| results.push(r.clone()))?;
    Ok(results)
}

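/// Validate the seed URL and compile include/exclude globs into a `CrawlPlan`.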
fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
    let seed = net::validate_url(&opts.url)?;
    let include = if opts.include.is_empty() {
        None
    } else {
        Some(crate::scope::build_globset(&opts.include)?)
    };
    let exclude = if opts.exclude.is_empty() {
        None
    } else {
        Some(crate::scope::build_globset(&opts.exclude)?)
    };
    Ok(CrawlPlan {
        seed,
        limit: opts.limit,
        max_depth: opts.max_depth,
        timeout_secs: opts.timeout.as_secs().max(1),
        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
        include,
        exclude,
        selector: opts.selector.clone(),
        json: opts.json,
        user_agent: opts.user_agent.clone(),
        concurrency: opts.concurrency,
        delay: opts.delay,
    })
}

/// Crawl configuration.
pub(crate) struct CrawlPlan {
    pub seed: Url,
    pub limit: usize,
    pub max_depth: usize,
    pub timeout_secs: u64,
    pub settle_ms: u64,
    pub include: Option<globset::GlobSet>,
    pub exclude: Option<globset::GlobSet>,
    pub selector: Option<String>,
    pub json: bool,
    pub user_agent: Option<String>,
    /// Parallel fetch limit (clamped to >=1; yields in completion order when >1).
    pub concurrency: usize,
    /// Dispatch interval; `None` disables rate limiting.
    pub delay: Option<Duration>,
}

/// Internal, serializable result for a single crawled page.
#[derive(serde::Serialize)]
pub(crate) struct CrawlPageResult {
    pub url: String,
    pub depth: usize,
    pub status: CrawlStatus,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    pub links_found: usize,
}

/// Status of a crawled page.
#[derive(serde::Serialize)]
#[serde(rename_all = "lowercase")]
pub(crate) enum CrawlStatus {
    Ok,
    Error,
}

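/// BFS frontier: a pending `(url, depth)` queue plus two dedup sets, one of
/// normalized URLs already seen and one of hashes of content already emitted.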
struct Frontier {
    queue: VecDeque<(Url, usize)>,
    visited: HashSet<String>,
    content_hashes: HashSet<u64>,
}

impl Frontier {
    fn new(seed: &Url) -> Self {
        Self {
            queue: VecDeque::from([(seed.clone(), 0)]),
            visited: HashSet::from([normalize_url(seed)]),
            content_hashes: HashSet::new(),
        }
    }

    fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
        if self.visited.insert(normalize_url(&url)) {
            self.queue.push_back((url, depth));
            true
        } else {
            false
        }
    }

    fn pop(&mut self) -> Option<(Url, usize)> {
        self.queue.pop_front()
    }

    fn is_duplicate_content(&mut self, content: &str) -> bool {
        let mut h = std::hash::DefaultHasher::new();
        content.hash(&mut h);
        !self.content_hashes.insert(h.finish())
    }

    fn pending(&self) -> usize {
        self.queue.len()
    }
}

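/// Collect absolute `http`/`https` links from `a[href]` elements, resolving
/// relative hrefs against `base`; other schemes (`javascript:`, `data:`,
/// `mailto:`, ...) are dropped.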
fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
    dom_query::Document::from(html)
        .select("a[href]")
        .iter()
        .filter_map(|el| {
            let href = el.attr("href")?;
            let href = href.trim();
            if href.is_empty() {
                return None;
            }
            let resolved = base.join(href).ok()?;
            matches!(resolved.scheme(), "http" | "https").then_some(resolved)
        })
        .collect()
}

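/// Core crawl loop: keep up to `concurrency` fetches in flight, pace dispatch
/// with the optional ticker, and handle each completion as the `JoinSet`
/// yields it. Ends when nothing is in flight and the frontier is empty or the
/// page `limit` is reached.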
pub(crate) async fn run(
    opts: CrawlPlan,
    robots: RobotsPolicy,
    fetcher: &(impl PageFetcher + Clone),
    mut on_page: impl FnMut(&CrawlPageResult),
) -> Vec<CrawlPageResult> {
    let mut frontier = Frontier::new(&opts.seed);
    let mut results = Vec::new();
    let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();

    // `Delay` keeps the steady-state rate correct after fetches exceed `delay`.
    let mut ticker = opts.delay.map(|period| {
        let mut t = interval(period);
        t.set_missed_tick_behavior(MissedTickBehavior::Delay);
        t
    });

    let concurrency = opts.concurrency.max(1);

    loop {
        while in_flight.len() < concurrency && results.len() + in_flight.len() < opts.limit {
            let Some((url, depth)) = frontier.pop() else {
                break;
            };
            if let Some(t) = ticker.as_mut() {
                t.tick().await;
            }
            spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
        }

        let outcome = match in_flight.join_next().await {
            None => break,
            Some(Ok(o)) => o,
            Some(Err(e)) if e.is_panic() => {
                tracing::error!(err = %e, "crawl fetch task panicked");
                continue;
            }
            Some(Err(e)) => {
                tracing::warn!(err = %e, "crawl fetch task cancelled");
                continue;
            }
        };

        let FetchOutcome { url, depth, result } = outcome;
        let page = match result {
            Ok(p) => p,
            Err(msg) => {
                let r = error_result(&url, depth, msg);
                on_page(&r);
                results.push(r);
                continue;
            }
        };

        // +1 counts the current page, which is about to be pushed.
        let budget_used = results.len() + in_flight.len() + 1;
        if let Some(r) =
            process_ok_fetch(&url, depth, &page, &mut frontier, &robots, &opts, budget_used)
        {
            on_page(&r);
            results.push(r);
        }
    }

    results
}

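/// Spawn one blocking fetch task for `url`, capturing per-page settings from
/// the plan.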
fn spawn_fetch(
    in_flight: &mut JoinSet<FetchOutcome>,
    fetcher: &(impl PageFetcher + Clone),
    opts: &CrawlPlan,
    url: Url,
    depth: usize,
) {
    let url_str = url.to_string();
    let timeout = opts.timeout_secs;
    let settle = opts.settle_ms;
    let user_agent = opts.user_agent.clone();
    let f = fetcher.clone();
    in_flight.spawn_blocking(move || FetchOutcome {
        url,
        depth,
        result: f
            .fetch_page(bridge::FetchOptions {
                url: &url_str,
                timeout_secs: timeout,
                settle_ms: settle,
                mode: bridge::FetchMode::Content { include_a11y: false },
                user_agent: user_agent.as_deref(),
            })
            .map_err(|e| format!("{e:#}")),
    });
}

/// Build a `CrawlPageResult` and enqueue discovered links.
fn process_ok_fetch(
    url: &Url,
    depth: usize,
    page: &bridge::ServoPage,
    frontier: &mut Frontier,
    robots: &RobotsPolicy,
    opts: &CrawlPlan,
    budget_used: usize,
) -> Option<CrawlPageResult> {
    let html = if page.html.len() > MAX_HTML_BYTES {
        &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
    } else {
        &page.html
    };

    let input = crate::extract::ExtractInput::new(html, url.as_str())
        .with_layout_json(page.layout_json.as_deref())
        .with_inner_text(page.inner_text.as_deref())
        .with_selector(opts.selector.as_deref());

    let content = if opts.json {
        crate::extract::extract_json(&input).ok()
    } else {
        crate::extract::extract_text(&input).ok()
    };

    if content.as_ref().is_some_and(|c| frontier.is_duplicate_content(c)) {
        return None;
    }

    let links = extract_links_from_html(html, url);
    let links_found = links.len();

    if depth < opts.max_depth {
        for link in &links {
            if budget_used + frontier.pending() >= opts.limit {
                break;
            }
            if !is_same_site(&opts.seed, link)
                || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
                || !robots.is_allowed(link)
                || !matches_scope(link, opts.include.as_ref(), opts.exclude.as_ref())
            {
                continue;
            }
            frontier.try_enqueue(link.clone(), depth + 1);
        }
    }

    let title = {
        let doc = dom_query::Document::from(html);
        let t = doc.select("title").text().to_string();
        (!t.is_empty()).then_some(t)
    };

    Some(CrawlPageResult {
        url: url.to_string(),
        depth,
        status: CrawlStatus::Ok,
        title,
        content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
        error: None,
        links_found,
    })
}

/// Fetch result crossing the `JoinSet` boundary; error is pre-stringified for `Send`.
struct FetchOutcome {
    url: Url,
    depth: usize,
    result: Result<bridge::ServoPage, String>,
}

fn error_result(url: &Url, depth: usize, error: String) -> CrawlPageResult {
    CrawlPageResult {
        url: url.to_string(),
        depth,
        status: CrawlStatus::Error,
        title: None,
        content: None,
        error: Some(error),
        links_found: 0,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;
    use std::sync::Arc;

    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
        assert_eq!(opts.concurrency, 1);
        assert_eq!(opts.delay, Some(Duration::from_millis(500)));
    }

    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
        assert_eq!(opts.concurrency, 4);
        assert_eq!(opts.delay, None);
    }

    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let opts = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(opts.concurrency, 1);
    }

    #[test]
    fn crawl_options_delay_custom_value() {
        let opts = CrawlOptions::new("https://example.com").delay(Some(Duration::from_secs(2)));
        assert_eq!(opts.delay, Some(Duration::from_secs(2)));
    }

    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler  /2.0"));
    }

    #[derive(Clone)]
    struct MockFetcher(Arc<HashMap<String, String>>);

    impl MockFetcher {
        fn new(pages: &[(&str, &str)]) -> Self {
            Self(Arc::new(
                pages.iter().map(|(u, h)| (u.to_string(), h.to_string())).collect(),
            ))
        }
    }

    impl PageFetcher for MockFetcher {
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> anyhow::Result<bridge::ServoPage> {
            self.0
                .get(opts.url)
                .map(|html| bridge::ServoPage {
                    html: html.clone(),
                    ..Default::default()
                })
                .ok_or_else(|| anyhow::anyhow!("not found: {}", opts.url))
        }
    }

    fn page(links: &[&str]) -> String {
        use std::fmt::Write;
        let mut anchors = String::new();
        for l in links {
            write!(anchors, r#"<a href="{l}">link</a>"#).unwrap();
        }
        format!("<html><head><title>Test</title></head><body>{anchors}</body></html>")
    }

    /// Leaf page with unique body to avoid content-hash dedup.
    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

    /// Test helper: build `CrawlPlan`, run, assert. `delay=None` keeps tests fast.
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        assert: impl FnOnce(&[CrawlPageResult]),
    ) {
        let fetcher = MockFetcher::new(pages);
        let seed = pages[0].0;
        let mut opts = CrawlPlan {
            seed: Url::parse(seed).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
        };
        configure(&mut opts);
        let results = run(opts, RobotsPolicy::Unavailable, &fetcher, |_| {}).await;
        assert(&results);
    }

    #[tokio::test]
    async fn crawl_single_page() {
        check(
            &[("https://example.com/", &page(&[]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 1);
                assert_eq!(r[0].url, "https://example.com/");
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_follows_links() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                (
                    "https://example.com/a",
                    "<html><head><title>A</title></head><body>page a</body></html>",
                ),
                (
                    "https://example.com/b",
                    "<html><head><title>B</title></head><body>page b</body></html>",
                ),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a"])),
                ("https://example.com/a", &page(&["/b"])),
                ("https://example.com/b", &page(&["/c"])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.max_depth = 1,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c"])),
                ("https://example.com/a", &page(&[])),
                ("https://example.com/b", &page(&[])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.limit = 2,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        check(
            &[
                ("https://example.com/", &page(&["https://other.com/x"])),
                ("https://other.com/x", &page(&[])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 1),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/a", "/a"])),
                ("https://example.com/a", &page(&["/"])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        check(
            &[("https://example.com/", &page(&["/missing"]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 2);
                assert!(matches!(r[1].status, CrawlStatus::Error));
                assert!(r[1].error.is_some());
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_applies_include_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/docs/a", "/blog/b"])),
                ("https://example.com/docs/a", &page(&[])),
                ("https://example.com/blog/b", &page(&[])),
            ],
            |o| o.include = Some(crate::scope::build_globset(&["/docs/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(r.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!r.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/public", "/secret/data"])),
                ("https://example.com/public", &page(&[])),
                ("https://example.com/secret/data", &page(&[])),
            ],
            |o| o.exclude = Some(crate::scope::build_globset(&["/secret/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(!r.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", same),
                ("https://example.com/b", same),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| o.concurrency = 4,
            |r| {
                assert_eq!(r.len(), 5);
                let urls: HashSet<&str> = r.iter().map(|p| p.url.as_str()).collect();
                for u in [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ] {
                    assert!(urls.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| {
                o.concurrency = 4;
                o.limit = 3;
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| o.concurrency = 1,
            |r| {
                assert_eq!(r.len(), 3);
                assert_eq!(r[0].url, "https://example.com/");
                assert_eq!(r[1].url, "https://example.com/a");
                assert_eq!(r[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        // 3 pages at 500ms delay = 2 ticks >= 1s (first dispatch is free).
        let start = tokio::time::Instant::now();
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| {
                o.concurrency = 1;
                o.delay = Some(Duration::from_millis(500));
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.try_enqueue(seed, 0));
        let other = Url::parse("https://example.com/page").unwrap();
        assert!(f.try_enqueue(other.clone(), 1));
        assert!(!f.try_enqueue(other, 1));
    }

    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert_eq!(f.pending(), 1);
        let (url, depth) = f.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);
        assert_eq!(f.pending(), 0);
        assert!(f.pop().is_none());
    }

    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
            <a href="javascript:void(0)">JS</a>
            <a href="JAVASCRIPT:alert(1)">JS upper</a>
            <a href="data:text/html,<h1>hi</h1>">Data</a>
            <a href="mailto:x@y.com">Mail</a>
            <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = extract_links_from_html(html, &base);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].as_str(), "https://example.com/a");
        assert_eq!(links[1].as_str(), "https://example.com/relative");
    }

    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let r = error_result(&url, 2, "timeout".into());
        assert!(matches!(r.status, CrawlStatus::Error));
        assert_eq!(r.error.as_deref(), Some("timeout"));
        assert!(r.content.is_none());
    }

    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.is_duplicate_content("unique content"));
        assert!(f.is_duplicate_content("unique content"));
        assert!(!f.is_duplicate_content("different content"));
    }
}