servo_fetch/
crawl.rs

1//! Site crawling — BFS link traversal with scope, robots.txt, and rate limiting.
2
3use std::collections::{HashSet, VecDeque};
4use std::hash::{DefaultHasher, Hash, Hasher};
5use std::time::{Duration, SystemTime};
6
7use tokio::task::{JoinSet, spawn_blocking};
8use tokio::time::{MissedTickBehavior, interval};
9use url::Url;
10
11use crate::bridge::{self, PageFetcher};
12use crate::net;
13use crate::robots::RobotsPolicy;
14use crate::scope::{is_same_site, matches_scope, normalize_url};
15
16const MAX_HTML_BYTES: usize = 2 * 1024 * 1024;
17
/// Builder-style configuration for crawling a site.
///
/// Create one with [`CrawlOptions::new`] and refine it with the chained setters.
#[derive(Debug, Clone)]
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
pub struct CrawlOptions {
    /// Seed URL the crawl starts from.
    pub(crate) url: String,
    /// Maximum number of pages to crawl.
    pub(crate) limit: usize,
    /// Maximum link depth from the seed URL.
    pub(crate) max_depth: usize,
    /// Page load timeout applied to each page.
    pub(crate) timeout: Duration,
    /// Extra wait after the load event of each page.
    pub(crate) settle: Duration,
    /// URL path glob patterns to include; empty means no include filter.
    pub(crate) include: Vec<String>,
    /// URL path glob patterns to exclude; empty means no exclude filter.
    pub(crate) exclude: Vec<String>,
    /// Optional CSS selector used to extract a section of each page.
    pub(crate) selector: Option<String>,
    /// Emit JSON content instead of Markdown when `true`.
    pub(crate) json: bool,
    /// Optional User-Agent override for every page of the crawl.
    pub(crate) user_agent: Option<String>,
    /// Maximum number of parallel fetches.
    pub(crate) concurrency: usize,
    /// Minimum dispatch interval; `None` disables rate limiting.
    pub(crate) delay: Option<Duration>,
}
35
36impl CrawlOptions {
37    /// Create crawl options for the given seed URL.
38    pub fn new(url: &str) -> Self {
39        Self {
40            url: url.into(),
41            limit: 50,
42            max_depth: 3,
43            timeout: Duration::from_secs(30),
44            settle: Duration::ZERO,
45            include: Vec::new(),
46            exclude: Vec::new(),
47            selector: None,
48            json: false,
49            user_agent: None,
50            concurrency: 1,
51            delay: Some(Duration::from_millis(500)),
52        }
53    }
54
55    /// Maximum number of pages to crawl (default: 50).
56    pub fn limit(mut self, n: usize) -> Self {
57        self.limit = n;
58        self
59    }
60
61    /// Maximum link depth from the seed URL (default: 3).
62    pub fn max_depth(mut self, n: usize) -> Self {
63        self.max_depth = n;
64        self
65    }
66
67    /// Page load timeout per page (default: 30s).
68    pub fn timeout(mut self, timeout: Duration) -> Self {
69        self.timeout = timeout;
70        self
71    }
72
73    /// Extra wait after load event per page (default: 0).
74    pub fn settle(mut self, settle: Duration) -> Self {
75        self.settle = settle;
76        self
77    }
78
79    /// URL path glob patterns to include (e.g. `"/docs/**"`).
80    pub fn include(mut self, patterns: &[&str]) -> Self {
81        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
82        self
83    }
84
85    /// URL path glob patterns to exclude (e.g. `"/docs/archive/**"`).
86    pub fn exclude(mut self, patterns: &[&str]) -> Self {
87        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
88        self
89    }
90
91    /// Output crawled content as JSON instead of Markdown.
92    pub fn json(mut self, json: bool) -> Self {
93        self.json = json;
94        self
95    }
96
97    /// CSS selector to extract a specific section per page.
98    pub fn selector(mut self, selector: impl Into<String>) -> Self {
99        self.selector = Some(selector.into());
100        self
101    }
102
103    /// Override the User-Agent string for all pages in this crawl.
104    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
105        self.user_agent = Some(net::sanitize_user_agent(ua.into()));
106        self
107    }
108
109    /// Maximum parallel fetches (default: 1). Values below 1 are clamped to 1.
110    /// Results are yielded in completion order when greater than 1.
111    pub fn concurrency(mut self, n: usize) -> Self {
112        self.concurrency = n.max(1);
113        self
114    }
115
116    /// Minimum dispatch interval (default: `Some(500ms)`). `None` disables rate limiting.
117    pub fn delay(mut self, delay: Option<Duration>) -> Self {
118        self.delay = delay;
119        self
120    }
121}
122
123/// Result for a single crawled page.
124#[derive(Debug)]
125#[non_exhaustive]
126pub struct CrawlResult {
127    /// URL of the crawled page.
128    pub url: String,
129    /// Link depth from the seed URL.
130    pub depth: usize,
131    /// Wall-clock time when the fetch completed.
132    pub fetched_at: SystemTime,
133    /// Page content if successful, or error if failed.
134    pub outcome: Result<CrawlPage, crate::error::Error>,
135}
136
/// Content of a page that was crawled successfully.
#[derive(Clone, Debug)]
pub struct CrawlPage {
    /// Title of the page, when one was found.
    pub title: Option<String>,
    /// Extracted content: Markdown or JSON, depending on the crawl options.
    pub content: String,
    /// How many links were discovered on this page.
    pub links_found: usize,
}
147
148impl serde::Serialize for CrawlResult {
149    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
150        use serde::ser::SerializeMap;
151        let fetched_at = humantime::format_rfc3339_millis(self.fetched_at).to_string();
152        match &self.outcome {
153            Ok(page) => {
154                let mut map = serializer.serialize_map(None)?;
155                map.serialize_entry("type", "page")?;
156                map.serialize_entry("url", &self.url)?;
157                map.serialize_entry("depth", &self.depth)?;
158                map.serialize_entry("fetched_at", &fetched_at)?;
159                if let Some(t) = &page.title {
160                    map.serialize_entry("title", t)?;
161                }
162                map.serialize_entry("content", &page.content)?;
163                map.serialize_entry("links_found", &page.links_found)?;
164                map.end()
165            }
166            Err(e) => {
167                let mut map = serializer.serialize_map(None)?;
168                map.serialize_entry("type", "error")?;
169                map.serialize_entry("url", &self.url)?;
170                map.serialize_entry("depth", &self.depth)?;
171                map.serialize_entry("fetched_at", &fetched_at)?;
172                map.serialize_entry("error", &e.to_string())?;
173                map.end()
174            }
175        }
176    }
177}
178
179impl CrawlResult {
180    fn from_internal(r: CrawlPageResult) -> Self {
181        let outcome = match r.status {
182            CrawlStatus::Ok => Ok(CrawlPage {
183                title: r.title,
184                content: r.content.unwrap_or_default(),
185                links_found: r.links_found,
186            }),
187            CrawlStatus::Error => Err(r
188                .error
189                .unwrap_or_else(|| crate::error::Error::engine("unknown crawl error", None))),
190        };
191        Self {
192            url: r.url,
193            depth: r.depth,
194            fetched_at: r.fetched_at,
195            outcome,
196        }
197    }
198}
199
200/// Crawl a site, invoking `on_page` for each result as it arrives (blocking).
201pub fn crawl_each_blocking<F>(opts: &CrawlOptions, on_page: F) -> crate::error::Result<()>
202where
203    F: FnMut(CrawlResult) + Send,
204{
205    crate::runtime::block_on(crawl_each(opts, on_page)).map_err(|e| crate::error::Error::engine(e, None))?
206}
207
208/// Crawl a site, invoking `on_page` for each result as it arrives.
209pub async fn crawl_each<F>(opts: &CrawlOptions, mut on_page: F) -> crate::error::Result<()>
210where
211    F: FnMut(CrawlResult) + Send,
212{
213    net::ensure_crypto_provider();
214    let plan = build_crawl_plan(opts)?;
215    let robots = spawn_blocking({
216        let seed = plan.seed.clone();
217        let user_agent = plan.user_agent.clone();
218        let timeout = Duration::from_secs(plan.timeout_secs);
219        move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
220    })
221    .await
222    .unwrap_or(RobotsPolicy::Unreachable);
223    run(plan, robots, &bridge::ServoFetcher, |r| {
224        on_page(CrawlResult::from_internal(r));
225    })
226    .await;
227    Ok(())
228}
229
230/// Crawl a site and collect all results (blocking).
231pub fn crawl_blocking(opts: &CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
232    let mut results = Vec::new();
233    crawl_each_blocking(opts, |r| results.push(r))?;
234    Ok(results)
235}
236
237/// Crawl a site and collect all results.
238pub async fn crawl(opts: &CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
239    let mut results = Vec::new();
240    crawl_each(opts, |r| results.push(r)).await?;
241    Ok(results)
242}
243
244fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
245    let seed = net::validate_url(&opts.url)?;
246    let include = if opts.include.is_empty() {
247        None
248    } else {
249        Some(crate::scope::build_globset(&opts.include)?)
250    };
251    let exclude = if opts.exclude.is_empty() {
252        None
253    } else {
254        Some(crate::scope::build_globset(&opts.exclude)?)
255    };
256    Ok(CrawlPlan {
257        seed,
258        limit: opts.limit,
259        max_depth: opts.max_depth,
260        timeout_secs: opts.timeout.as_secs().max(1),
261        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
262        include,
263        exclude,
264        selector: opts.selector.clone(),
265        json: opts.json,
266        user_agent: opts.user_agent.clone(),
267        concurrency: opts.concurrency,
268        delay: opts.delay,
269    })
270}
271
272/// Crawl configuration.
273pub(crate) struct CrawlPlan {
274    pub seed: Url,
275    pub limit: usize,
276    pub max_depth: usize,
277    pub timeout_secs: u64,
278    pub settle_ms: u64,
279    pub include: Option<globset::GlobSet>,
280    pub exclude: Option<globset::GlobSet>,
281    pub selector: Option<String>,
282    pub json: bool,
283    pub user_agent: Option<String>,
284    /// Parallel fetch limit (clamped to >=1; yields in completion order when >1).
285    pub concurrency: usize,
286    /// Dispatch interval; `None` disables rate limiting.
287    pub delay: Option<Duration>,
288}
289
290/// Result for a single crawled page.
291pub(crate) struct CrawlPageResult {
292    pub url: String,
293    pub depth: usize,
294    pub status: CrawlStatus,
295    pub title: Option<String>,
296    pub content: Option<String>,
297    pub error: Option<crate::error::Error>,
298    pub links_found: usize,
299    pub fetched_at: SystemTime,
300}
301
/// Status of a crawled page.
///
/// A plain two-state marker, so it derives the cheap value-type traits:
/// it can be logged with `{:?}`, copied freely, and compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum CrawlStatus {
    /// The page was fetched and processed.
    Ok,
    /// The fetch failed; the accompanying record carries the error.
    Error,
}
307
308struct Frontier {
309    queue: VecDeque<(Url, usize)>,
310    visited: HashSet<String>,
311    content_hashes: HashSet<u64>,
312}
313
314impl Frontier {
315    fn new(seed: &Url) -> Self {
316        Self {
317            queue: VecDeque::from([(seed.clone(), 0)]),
318            visited: HashSet::from([normalize_url(seed)]),
319            content_hashes: HashSet::new(),
320        }
321    }
322
323    fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
324        if self.visited.insert(normalize_url(&url)) {
325            self.queue.push_back((url, depth));
326            true
327        } else {
328            false
329        }
330    }
331
332    fn pop(&mut self) -> Option<(Url, usize)> {
333        self.queue.pop_front()
334    }
335
336    fn is_duplicate_content(&mut self, content: &str) -> bool {
337        let mut h = DefaultHasher::new();
338        content.hash(&mut h);
339        !self.content_hashes.insert(h.finish())
340    }
341
342    fn pending(&self) -> usize {
343        self.queue.len()
344    }
345}
346
347fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
348    dom_query::Document::from(html)
349        .select("a[href]")
350        .iter()
351        .filter_map(|el| {
352            let href = el.attr("href")?;
353            let href = href.trim();
354            if href.is_empty() {
355                return None;
356            }
357            let resolved = base.join(href).ok()?;
358            matches!(resolved.scheme(), "http" | "https").then_some(resolved)
359        })
360        .collect()
361}
362
363pub(crate) async fn run(
364    opts: CrawlPlan,
365    robots: RobotsPolicy,
366    fetcher: &(impl PageFetcher + Clone),
367    mut on_page: impl FnMut(CrawlPageResult),
368) {
369    let mut frontier = Frontier::new(&opts.seed);
370    let mut completed: usize = 0;
371    let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();
372
373    // `Delay` keeps the steady-state rate correct after fetches exceed `delay`.
374    let mut ticker = opts.delay.map(|period| {
375        let mut t = interval(period);
376        t.set_missed_tick_behavior(MissedTickBehavior::Delay);
377        t
378    });
379
380    let concurrency = opts.concurrency.max(1);
381
382    loop {
383        while in_flight.len() < concurrency && completed + in_flight.len() < opts.limit {
384            let Some((url, depth)) = frontier.pop() else {
385                break;
386            };
387            if let Some(t) = ticker.as_mut() {
388                t.tick().await;
389            }
390            spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
391        }
392
393        let outcome = match in_flight.join_next().await {
394            None => break,
395            Some(Ok(o)) => o,
396            Some(Err(e)) if e.is_panic() => {
397                tracing::error!(err = %e, "crawl fetch task panicked");
398                continue;
399            }
400            Some(Err(e)) => {
401                tracing::warn!(err = %e, "crawl fetch task cancelled");
402                continue;
403            }
404        };
405
406        let FetchOutcome {
407            url,
408            depth,
409            result,
410            fetched_at,
411        } = outcome;
412        let page = match result {
413            Ok(p) => p,
414            Err(err) => {
415                on_page(error_result(&url, depth, err, fetched_at));
416                completed += 1;
417                continue;
418            }
419        };
420
421        let budget_used = completed + in_flight.len() + 1;
422        let mut ctx = CrawlContext {
423            frontier: &mut frontier,
424            robots: &robots,
425            opts: &opts,
426        };
427        if let Some(r) = process_ok_fetch(&mut ctx, &url, depth, &page, budget_used, fetched_at) {
428            on_page(r);
429            completed += 1;
430        }
431    }
432}
433
434fn spawn_fetch(
435    in_flight: &mut JoinSet<FetchOutcome>,
436    fetcher: &(impl PageFetcher + Clone),
437    opts: &CrawlPlan,
438    url: Url,
439    depth: usize,
440) {
441    let url_str = url.to_string();
442    let timeout = opts.timeout_secs;
443    let settle = opts.settle_ms;
444    let user_agent = opts.user_agent.clone();
445    let f = fetcher.clone();
446    in_flight.spawn_blocking(move || {
447        let result = f
448            .fetch_page(bridge::FetchOptions {
449                url: &url_str,
450                timeout_secs: timeout,
451                settle_ms: settle,
452                mode: bridge::FetchMode::Content { include_a11y: false },
453                user_agent: user_agent.as_deref(),
454            })
455            .map_err(|e| crate::error::Error::engine(e, Some(url_str.clone())));
456        FetchOutcome {
457            url,
458            depth,
459            result,
460            fetched_at: SystemTime::now(),
461        }
462    });
463}
464
465/// Stable crawl state passed to `process_ok_fetch`.
466struct CrawlContext<'a> {
467    frontier: &'a mut Frontier,
468    robots: &'a RobotsPolicy,
469    opts: &'a CrawlPlan,
470}
471
472/// Build a `CrawlPageResult` and enqueue discovered links.
473fn process_ok_fetch(
474    ctx: &mut CrawlContext<'_>,
475    url: &Url,
476    depth: usize,
477    page: &bridge::ServoPage,
478    budget_used: usize,
479    fetched_at: SystemTime,
480) -> Option<CrawlPageResult> {
481    let html = if page.html.len() > MAX_HTML_BYTES {
482        &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
483    } else {
484        &page.html
485    };
486
487    let input = crate::extract::ExtractInput::new(html, url.as_str())
488        .with_layout_json(page.layout_json.as_deref())
489        .with_inner_text(page.inner_text.as_deref())
490        .with_selector(ctx.opts.selector.as_deref());
491
492    let content = if ctx.opts.json {
493        crate::extract::extract_json(&input).ok()
494    } else {
495        crate::extract::extract_text(&input).ok()
496    };
497
498    if content.as_ref().is_some_and(|c| ctx.frontier.is_duplicate_content(c)) {
499        return None;
500    }
501
502    let links = extract_links_from_html(html, url);
503    let links_found = links.len();
504
505    if depth < ctx.opts.max_depth {
506        for link in &links {
507            if budget_used + ctx.frontier.pending() >= ctx.opts.limit {
508                break;
509            }
510            if !is_same_site(&ctx.opts.seed, link)
511                || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
512                || !ctx.robots.is_allowed(link)
513                || !matches_scope(link, ctx.opts.include.as_ref(), ctx.opts.exclude.as_ref())
514            {
515                continue;
516            }
517            ctx.frontier.try_enqueue(link.clone(), depth + 1);
518        }
519    }
520
521    let title = {
522        let doc = dom_query::Document::from(html);
523        let t = doc.select("title").text().to_string();
524        (!t.is_empty()).then_some(t)
525    };
526
527    Some(CrawlPageResult {
528        url: url.to_string(),
529        depth,
530        status: CrawlStatus::Ok,
531        title,
532        content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
533        error: None,
534        links_found,
535        fetched_at,
536    })
537}
538
539/// Fetch result crossing the `JoinSet` boundary.
540struct FetchOutcome {
541    url: Url,
542    depth: usize,
543    result: Result<bridge::ServoPage, crate::error::Error>,
544    fetched_at: SystemTime,
545}
546
547fn error_result(url: &Url, depth: usize, error: crate::error::Error, fetched_at: SystemTime) -> CrawlPageResult {
548    CrawlPageResult {
549        url: url.to_string(),
550        depth,
551        status: CrawlStatus::Error,
552        title: None,
553        content: None,
554        error: Some(error),
555        links_found: 0,
556        fetched_at,
557    }
558}
559
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;

    use super::*;

    // ---- CrawlOptions builder ----

    #[test]
    fn crawl_options_defaults() {
        let defaults = CrawlOptions::new("https://example.com");
        assert_eq!(defaults.url, "https://example.com");
        assert_eq!(defaults.limit, 50);
        assert_eq!(defaults.max_depth, 3);
        assert_eq!(defaults.timeout, Duration::from_secs(30));
        assert!(defaults.include.is_empty());
        assert!(defaults.exclude.is_empty());
        assert_eq!(defaults.concurrency, 1);
        assert_eq!(defaults.delay, Some(Duration::from_millis(500)));
    }

    #[test]
    fn crawl_options_chaining() {
        let configured = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(configured.limit, 100);
        assert_eq!(configured.max_depth, 5);
        assert_eq!(configured.include, vec!["/docs/**"]);
        assert_eq!(configured.exclude, vec!["/docs/archive/**"]);
        assert_eq!(configured.concurrency, 4);
        assert_eq!(configured.delay, None);
    }

    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let clamped = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(clamped.concurrency, 1);
    }

    #[test]
    fn crawl_options_delay_custom_value() {
        let two_seconds = Duration::from_secs(2);
        let slowed = CrawlOptions::new("https://example.com").delay(Some(two_seconds));
        assert_eq!(slowed.delay, Some(two_seconds));
    }

    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let tainted = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(tainted.user_agent.as_deref(), Some("Crawler  /2.0"));
    }

    // ---- Fixtures ----

    /// In-memory fetcher: serves HTML by exact URL and fails for anything else.
    #[derive(Clone)]
    struct MockFetcher {
        pages: Arc<HashMap<String, String>>,
    }

    impl MockFetcher {
        fn new(pages: &[(&str, &str)]) -> Self {
            let mut table = HashMap::new();
            for (address, html) in pages {
                table.insert(address.to_string(), html.to_string());
            }
            Self { pages: Arc::new(table) }
        }
    }

    impl PageFetcher for MockFetcher {
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> anyhow::Result<bridge::ServoPage> {
            match self.pages.get(opts.url) {
                Some(html) => Ok(bridge::ServoPage {
                    html: html.clone(),
                    ..Default::default()
                }),
                None => Err(anyhow::anyhow!("not found: {}", opts.url)),
            }
        }
    }

    /// Page titled "Test" whose body is one anchor per entry in `links`.
    fn page(links: &[&str]) -> String {
        let anchors: String = links
            .iter()
            .map(|target| format!(r#"<a href="{target}">link</a>"#))
            .collect();
        format!("<html><head><title>Test</title></head><body>{anchors}</body></html>")
    }

    /// Leaf page with unique body to avoid content-hash dedup.
    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

    /// Test helper: build `CrawlPlan`, run, assert. `delay=None` keeps tests fast.
    ///
    /// The first entry of `pages` is the seed.
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        assert: impl FnOnce(&[CrawlPageResult]),
    ) {
        let (seed, _) = pages[0];
        let mut plan = CrawlPlan {
            seed: Url::parse(seed).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
        };
        configure(&mut plan);

        let fetcher = MockFetcher::new(pages);
        let mut collected = Vec::new();
        run(plan, RobotsPolicy::Unavailable, &fetcher, |result| collected.push(result)).await;
        assert(&collected);
    }

    // ---- Crawl loop ----

    #[tokio::test]
    async fn crawl_single_page() {
        let home = page(&[]);
        check(
            &[("https://example.com/", home.as_str())],
            |_| {},
            |results| {
                assert_eq!(results.len(), 1);
                assert_eq!(results[0].url, "https://example.com/");
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_follows_links() {
        let home = page(&["/a", "/b"]);
        check(
            &[
                ("https://example.com/", home.as_str()),
                (
                    "https://example.com/a",
                    "<html><head><title>A</title></head><body>page a</body></html>",
                ),
                (
                    "https://example.com/b",
                    "<html><head><title>B</title></head><body>page b</body></html>",
                ),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 3),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        let home = page(&["/a"]);
        let level_one = page(&["/b"]);
        let level_two = page(&["/c"]);
        let level_three = page(&[]);
        check(
            &[
                ("https://example.com/", home.as_str()),
                ("https://example.com/a", level_one.as_str()),
                ("https://example.com/b", level_two.as_str()),
                ("https://example.com/c", level_three.as_str()),
            ],
            |plan| plan.max_depth = 1,
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_respects_limit() {
        let home = page(&["/a", "/b", "/c"]);
        let leaf = page(&[]);
        check(
            &[
                ("https://example.com/", home.as_str()),
                ("https://example.com/a", leaf.as_str()),
                ("https://example.com/b", leaf.as_str()),
                ("https://example.com/c", leaf.as_str()),
            ],
            |plan| plan.limit = 2,
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        let home = page(&["https://other.com/x"]);
        let foreign = page(&[]);
        check(
            &[
                ("https://example.com/", home.as_str()),
                ("https://other.com/x", foreign.as_str()),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 1),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        let home = page(&["/a", "/a", "/a"]);
        let back_link = page(&["/"]);
        check(
            &[
                ("https://example.com/", home.as_str()),
                ("https://example.com/a", back_link.as_str()),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        let home = page(&["/missing"]);
        check(
            &[("https://example.com/", home.as_str())],
            |_| {},
            |results| {
                assert_eq!(results.len(), 2);
                let failed = &results[1];
                assert!(matches!(failed.status, CrawlStatus::Error));
                assert!(failed.error.is_some());
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_applies_include_glob() {
        let home = page(&["/docs/a", "/blog/b"]);
        let leaf = page(&[]);
        check(
            &[
                ("https://example.com/", home.as_str()),
                ("https://example.com/docs/a", leaf.as_str()),
                ("https://example.com/blog/b", leaf.as_str()),
            ],
            |plan| {
                let docs_only = crate::scope::build_globset(&["/docs/**".into()]).unwrap();
                plan.include = Some(docs_only);
            },
            |results| {
                assert_eq!(results.len(), 2);
                assert!(results.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!results.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        let home = page(&["/public", "/secret/data"]);
        let leaf = page(&[]);
        check(
            &[
                ("https://example.com/", home.as_str()),
                ("https://example.com/public", leaf.as_str()),
                ("https://example.com/secret/data", leaf.as_str()),
            ],
            |plan| {
                let no_secrets = crate::scope::build_globset(&["/secret/**".into()]).unwrap();
                plan.exclude = Some(no_secrets);
            },
            |results| {
                assert_eq!(results.len(), 2);
                assert!(!results.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        let home = page(&["/a", "/b"]);
        check(
            &[
                ("https://example.com/", home.as_str()),
                ("https://example.com/a", same),
                ("https://example.com/b", same),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        let home = page(&["/a", "/b", "/c", "/d"]);
        let (a, b, c, d) = (
            distinct_page("a"),
            distinct_page("b"),
            distinct_page("c"),
            distinct_page("d"),
        );
        check(
            &[
                ("https://example.com/", home.as_str()),
                ("https://example.com/a", a.as_str()),
                ("https://example.com/b", b.as_str()),
                ("https://example.com/c", c.as_str()),
                ("https://example.com/d", d.as_str()),
            ],
            |plan| plan.concurrency = 4,
            |results| {
                assert_eq!(results.len(), 5);
                let urls: HashSet<&str> = results.iter().map(|p| p.url.as_str()).collect();
                let expected = [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ];
                for u in expected {
                    assert!(urls.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        let home = page(&["/a", "/b", "/c", "/d"]);
        let (a, b, c, d) = (
            distinct_page("a"),
            distinct_page("b"),
            distinct_page("c"),
            distinct_page("d"),
        );
        check(
            &[
                ("https://example.com/", home.as_str()),
                ("https://example.com/a", a.as_str()),
                ("https://example.com/b", b.as_str()),
                ("https://example.com/c", c.as_str()),
                ("https://example.com/d", d.as_str()),
            ],
            |plan| {
                plan.concurrency = 4;
                plan.limit = 3;
            },
            |results| assert_eq!(results.len(), 3),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        let home = page(&["/a", "/b"]);
        let (a, b) = (distinct_page("a"), distinct_page("b"));
        check(
            &[
                ("https://example.com/", home.as_str()),
                ("https://example.com/a", a.as_str()),
                ("https://example.com/b", b.as_str()),
            ],
            |plan| plan.concurrency = 1,
            |results| {
                assert_eq!(results.len(), 3);
                assert_eq!(results[0].url, "https://example.com/");
                assert_eq!(results[1].url, "https://example.com/a");
                assert_eq!(results[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        // 3 pages at 500ms delay = 2 ticks >= 1s (first dispatch is free).
        let home = page(&["/a", "/b"]);
        let (a, b) = (distinct_page("a"), distinct_page("b"));
        let start = tokio::time::Instant::now();
        check(
            &[
                ("https://example.com/", home.as_str()),
                ("https://example.com/a", a.as_str()),
                ("https://example.com/b", b.as_str()),
            ],
            |plan| {
                plan.concurrency = 1;
                plan.delay = Some(Duration::from_millis(500));
            },
            |results| assert_eq!(results.len(), 3),
        )
        .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    // ---- Frontier and helpers ----

    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        // The seed is marked visited at construction.
        assert!(!frontier.try_enqueue(seed, 0));

        let other = Url::parse("https://example.com/page").unwrap();
        assert!(frontier.try_enqueue(other.clone(), 1));
        assert!(!frontier.try_enqueue(other, 1));
    }

    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert_eq!(frontier.pending(), 1);

        let (url, depth) = frontier.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);
        assert_eq!(frontier.pending(), 0);
        assert!(frontier.pop().is_none());
    }

    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
            <a href="javascript:void(0)">JS</a>
            <a href="JAVASCRIPT:alert(1)">JS upper</a>
            <a href="data:text/html,<h1>hi</h1>">Data</a>
            <a href="mailto:x@y.com">Mail</a>
            <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = extract_links_from_html(html, &base);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].as_str(), "https://example.com/a");
        assert_eq!(links[1].as_str(), "https://example.com/relative");
    }

    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let cause = crate::error::Error::engine("timeout", None);
        let record = error_result(&url, 2, cause, SystemTime::now());
        assert!(matches!(record.status, CrawlStatus::Error));
        assert!(record.error.as_ref().is_some_and(|e| e.to_string().contains("timeout")));
        assert!(record.content.is_none());
    }

    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert!(!frontier.is_duplicate_content("unique content"));
        assert!(frontier.is_duplicate_content("unique content"));
        assert!(!frontier.is_duplicate_content("different content"));
    }
}
servo_fetch/crawl.rs

servo_fetch/
crawl.rs