servo_fetch/
crawl.rs

1//! Site crawling — BFS link traversal with scope, robots.txt, and rate limiting.
2
3use std::collections::{HashSet, VecDeque};
4use std::hash::{Hash, Hasher};
5use std::time::{Duration, SystemTime};
6
7use tokio::task::JoinSet;
8use tokio::time::{MissedTickBehavior, interval};
9use url::Url;
10
11use crate::bridge::{self, PageFetcher};
12use crate::net;
13use crate::robots::RobotsPolicy;
14use crate::scope::{is_same_site, matches_scope, normalize_url};
15
/// Upper bound (2 MiB) on the HTML bytes of a page that are parsed; longer documents are
/// truncated to this size before extraction.
const MAX_HTML_BYTES: usize = 2 << 20;
17
/// Options for crawling a site.
///
/// Created with `CrawlOptions::new` and refined through the chained setter methods; the
/// values are only validated when a crawl is actually started.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    /// Seed URL exactly as supplied by the caller.
    pub(crate) url: String,
    /// Maximum number of pages to crawl.
    pub(crate) limit: usize,
    /// Maximum link depth from the seed URL.
    pub(crate) max_depth: usize,
    /// Page load timeout applied to each page.
    pub(crate) timeout: Duration,
    /// Extra wait after the load event of each page.
    pub(crate) settle: Duration,
    /// URL path glob patterns a link must match to be followed (empty = no include filter).
    pub(crate) include: Vec<String>,
    /// URL path glob patterns that exclude a link from being followed.
    pub(crate) exclude: Vec<String>,
    /// Optional CSS selector restricting extraction to one section per page.
    pub(crate) selector: Option<String>,
    /// Emit page content as JSON instead of Markdown.
    pub(crate) json: bool,
    /// User-Agent override for every page of the crawl (`None` = engine default).
    pub(crate) user_agent: Option<String>,
    /// Maximum number of parallel fetches.
    pub(crate) concurrency: usize,
    /// Minimum interval between dispatches; `None` disables rate limiting.
    pub(crate) delay: Option<Duration>,
}
35
36impl CrawlOptions {
37    /// Create crawl options for the given seed URL.
38    pub fn new(url: &str) -> Self {
39        Self {
40            url: url.into(),
41            limit: 50,
42            max_depth: 3,
43            timeout: Duration::from_secs(30),
44            settle: Duration::ZERO,
45            include: Vec::new(),
46            exclude: Vec::new(),
47            selector: None,
48            json: false,
49            user_agent: None,
50            concurrency: 1,
51            delay: Some(Duration::from_millis(500)),
52        }
53    }
54
55    /// Maximum number of pages to crawl (default: 50).
56    pub fn limit(mut self, n: usize) -> Self {
57        self.limit = n;
58        self
59    }
60
61    /// Maximum link depth from the seed URL (default: 3).
62    pub fn max_depth(mut self, n: usize) -> Self {
63        self.max_depth = n;
64        self
65    }
66
67    /// Page load timeout per page (default: 30s).
68    pub fn timeout(mut self, timeout: Duration) -> Self {
69        self.timeout = timeout;
70        self
71    }
72
73    /// Extra wait after load event per page (default: 0).
74    pub fn settle(mut self, settle: Duration) -> Self {
75        self.settle = settle;
76        self
77    }
78
79    /// URL path glob patterns to include (e.g. `"/docs/**"`).
80    pub fn include(mut self, patterns: &[&str]) -> Self {
81        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
82        self
83    }
84
85    /// URL path glob patterns to exclude (e.g. `"/docs/archive/**"`).
86    pub fn exclude(mut self, patterns: &[&str]) -> Self {
87        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
88        self
89    }
90
91    /// Output crawled content as JSON instead of Markdown.
92    pub fn json(mut self, json: bool) -> Self {
93        self.json = json;
94        self
95    }
96
97    /// CSS selector to extract a specific section per page.
98    pub fn selector(mut self, selector: impl Into<String>) -> Self {
99        self.selector = Some(selector.into());
100        self
101    }
102
103    /// Override the User-Agent string for all pages in this crawl.
104    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
105        self.user_agent = Some(net::sanitize_user_agent(ua.into()));
106        self
107    }
108
109    /// Maximum parallel fetches (default: 1). Values below 1 are clamped to 1.
110    /// Results are yielded in completion order when greater than 1.
111    pub fn concurrency(mut self, n: usize) -> Self {
112        self.concurrency = n.max(1);
113        self
114    }
115
116    /// Minimum dispatch interval (default: `Some(500ms)`). `None` disables rate limiting.
117    pub fn delay(mut self, delay: Option<Duration>) -> Self {
118        self.delay = delay;
119        self
120    }
121}
122
/// Result for a single crawled page.
///
/// Carries the page's location and timing together with either its extracted content or
/// the error that prevented it from being fetched.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL of the crawled page.
    pub url: String,
    /// Link depth from the seed URL (the seed itself is depth 0).
    pub depth: usize,
    /// Wall-clock time when the fetch completed.
    pub fetched_at: SystemTime,
    /// Page content if successful, or error if failed.
    pub outcome: Result<CrawlPage, CrawlError>,
}
136
/// Successfully crawled page.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Page title, or `None` when the document had no non-empty title.
    pub title: Option<String>,
    /// Extracted content (Markdown or JSON depending on options).
    pub content: String,
    /// Number of http/https links discovered on this page, counted before any scope,
    /// robots, or de-duplication filtering.
    pub links_found: usize,
}
147
/// Failure recorded for a page that could not be crawled.
#[derive(Debug, Clone)]
pub struct CrawlError {
    /// Human-readable description of what went wrong.
    pub message: String,
}

impl std::fmt::Display for CrawlError {
    /// Writes the stored message verbatim.
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { message } = self;
        out.write_str(message.as_str())
    }
}

impl std::error::Error for CrawlError {}
162
163impl serde::Serialize for CrawlResult {
164    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
165        use serde::ser::SerializeMap;
166        let fetched_at = humantime::format_rfc3339_millis(self.fetched_at).to_string();
167        match &self.outcome {
168            Ok(page) => {
169                let mut map = serializer.serialize_map(None)?;
170                map.serialize_entry("type", "page")?;
171                map.serialize_entry("url", &self.url)?;
172                map.serialize_entry("depth", &self.depth)?;
173                map.serialize_entry("fetched_at", &fetched_at)?;
174                if let Some(t) = &page.title {
175                    map.serialize_entry("title", t)?;
176                }
177                map.serialize_entry("content", &page.content)?;
178                map.serialize_entry("links_found", &page.links_found)?;
179                map.end()
180            }
181            Err(e) => {
182                let mut map = serializer.serialize_map(None)?;
183                map.serialize_entry("type", "error")?;
184                map.serialize_entry("url", &self.url)?;
185                map.serialize_entry("depth", &self.depth)?;
186                map.serialize_entry("fetched_at", &fetched_at)?;
187                map.serialize_entry("error", &e.message)?;
188                map.end()
189            }
190        }
191    }
192}
193
194impl CrawlResult {
195    fn from_internal(r: &CrawlPageResult) -> Self {
196        let outcome = match r.status {
197            CrawlStatus::Ok => Ok(CrawlPage {
198                title: r.title.clone(),
199                content: r.content.clone().unwrap_or_default(),
200                links_found: r.links_found,
201            }),
202            CrawlStatus::Error => Err(CrawlError {
203                message: r.error.clone().unwrap_or_default(),
204            }),
205        };
206        Self {
207            url: r.url.clone(),
208            depth: r.depth,
209            fetched_at: r.fetched_at,
210            outcome,
211        }
212    }
213}
214
215/// Crawl a site, invoking `on_page` for each result as it arrives.
216#[allow(clippy::needless_pass_by_value)]
217pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
218    net::ensure_crypto_provider();
219    let plan = build_crawl_plan(&opts)?;
220    crate::runtime::block_on(async {
221        let robots = tokio::task::spawn_blocking({
222            let seed = plan.seed.clone();
223            let user_agent = plan.user_agent.clone();
224            let timeout = Duration::from_secs(plan.timeout_secs);
225            move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
226        })
227        .await
228        .unwrap_or(RobotsPolicy::Unreachable);
229        run(plan, robots, &bridge::ServoFetcher, |r| {
230            on_page(&CrawlResult::from_internal(r));
231        })
232        .await
233    })
234    .map_err(|e| crate::error::Error::Engine(e.to_string()))?;
235    Ok(())
236}
237
238/// Crawl a site and collect all results.
239#[allow(clippy::needless_pass_by_value)]
240pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
241    let mut results = Vec::new();
242    crawl_each(opts, |r| results.push(r.clone()))?;
243    Ok(results)
244}
245
246fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
247    let seed = net::validate_url(&opts.url)?;
248    let include = if opts.include.is_empty() {
249        None
250    } else {
251        Some(crate::scope::build_globset(&opts.include)?)
252    };
253    let exclude = if opts.exclude.is_empty() {
254        None
255    } else {
256        Some(crate::scope::build_globset(&opts.exclude)?)
257    };
258    Ok(CrawlPlan {
259        seed,
260        limit: opts.limit,
261        max_depth: opts.max_depth,
262        timeout_secs: opts.timeout.as_secs().max(1),
263        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
264        include,
265        exclude,
266        selector: opts.selector.clone(),
267        json: opts.json,
268        user_agent: opts.user_agent.clone(),
269        concurrency: opts.concurrency,
270        delay: opts.delay,
271    })
272}
273
/// Crawl configuration.
///
/// The validated, ready-to-run form of the user-facing options: the seed is a parsed
/// `Url`, glob lists are compiled, and durations are reduced to integer units.
pub(crate) struct CrawlPlan {
    /// Parsed seed URL where the crawl starts.
    pub seed: Url,
    /// Maximum number of pages to crawl.
    pub limit: usize,
    /// Maximum link depth from the seed; links found on pages at this depth are not followed.
    pub max_depth: usize,
    /// Per-page load timeout in whole seconds.
    pub timeout_secs: u64,
    /// Post-load settle time in milliseconds.
    pub settle_ms: u64,
    /// Compiled include globs; `None` when no include patterns were given.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude globs; `None` when no exclude patterns were given.
    pub exclude: Option<globset::GlobSet>,
    /// Optional CSS selector forwarded to content extraction.
    pub selector: Option<String>,
    /// Extract content as JSON rather than text.
    pub json: bool,
    /// User-Agent override passed to every fetch.
    pub user_agent: Option<String>,
    /// Parallel fetch limit (clamped to >=1; yields in completion order when >1).
    pub concurrency: usize,
    /// Dispatch interval; `None` disables rate limiting.
    pub delay: Option<Duration>,
}
291
/// Result for a single crawled page.
///
/// Crate-internal flat record: `status` says which of `content`/`error` is meaningful.
pub(crate) struct CrawlPageResult {
    /// URL of the page as a string.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Whether the fetch succeeded or failed.
    pub status: CrawlStatus,
    /// Page title, when one was found.
    pub title: Option<String>,
    /// Extracted content; `None` for failed fetches or when extraction produced nothing.
    pub content: Option<String>,
    /// Error message for failed fetches; `None` on success.
    pub error: Option<String>,
    /// Number of links discovered on the page (0 for failed fetches).
    pub links_found: usize,
    /// Wall-clock time at which the fetch finished.
    pub fetched_at: SystemTime,
}
303
/// Status of a crawled page.
pub(crate) enum CrawlStatus {
    /// The page was fetched and processed.
    Ok,
    /// The fetch failed; the accompanying record carries the error message.
    Error,
}
309
/// Crawl frontier: the FIFO work queue plus the bookkeeping used for de-duplication.
struct Frontier {
    /// URLs waiting to be fetched, each paired with its link depth.
    queue: VecDeque<(Url, usize)>,
    /// Normalized form of every URL that has ever been enqueued.
    visited: HashSet<String>,
    /// Hashes of page contents already seen, used to drop duplicate pages.
    content_hashes: HashSet<u64>,
}
315
316impl Frontier {
317    fn new(seed: &Url) -> Self {
318        Self {
319            queue: VecDeque::from([(seed.clone(), 0)]),
320            visited: HashSet::from([normalize_url(seed)]),
321            content_hashes: HashSet::new(),
322        }
323    }
324
325    fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
326        if self.visited.insert(normalize_url(&url)) {
327            self.queue.push_back((url, depth));
328            true
329        } else {
330            false
331        }
332    }
333
334    fn pop(&mut self) -> Option<(Url, usize)> {
335        self.queue.pop_front()
336    }
337
338    fn is_duplicate_content(&mut self, content: &str) -> bool {
339        let mut h = std::hash::DefaultHasher::new();
340        content.hash(&mut h);
341        !self.content_hashes.insert(h.finish())
342    }
343
344    fn pending(&self) -> usize {
345        self.queue.len()
346    }
347}
348
349fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
350    dom_query::Document::from(html)
351        .select("a[href]")
352        .iter()
353        .filter_map(|el| {
354            let href = el.attr("href")?;
355            let href = href.trim();
356            if href.is_empty() {
357                return None;
358            }
359            let resolved = base.join(href).ok()?;
360            matches!(resolved.scheme(), "http" | "https").then_some(resolved)
361        })
362        .collect()
363}
364
/// Drive a crawl to completion and return every recorded page result.
///
/// Starting from `opts.seed`, URLs are taken from the frontier in FIFO order and fetched on
/// blocking tasks, with at most `opts.concurrency` (minimum 1) fetches in flight. No new
/// fetch is dispatched once recorded results plus in-flight fetches reach `opts.limit`.
/// When `opts.delay` is set, each dispatch first waits for the next interval tick.
///
/// `on_page` is called once for each recorded result, just before it is appended to the
/// returned vector. Fetch failures are recorded as error results; fetch tasks that panic or
/// are cancelled are only logged, and pages for which `process_ok_fetch` returns `None`
/// are not recorded at all.
pub(crate) async fn run(
    opts: CrawlPlan,
    robots: RobotsPolicy,
    fetcher: &(impl PageFetcher + Clone),
    mut on_page: impl FnMut(&CrawlPageResult),
) -> Vec<CrawlPageResult> {
    let mut frontier = Frontier::new(&opts.seed);
    let mut results = Vec::new();
    let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();

    // `Delay` keeps the steady-state rate correct after fetches exceed `delay`.
    let mut ticker = opts.delay.map(|period| {
        let mut t = interval(period);
        t.set_missed_tick_behavior(MissedTickBehavior::Delay);
        t
    });

    // Defensive clamp: a plan built by hand could carry a concurrency of 0.
    let concurrency = opts.concurrency.max(1);

    loop {
        // Dispatch phase: top up the in-flight set while both the concurrency cap and the
        // page budget (finished + running) leave room.
        while in_flight.len() < concurrency && results.len() + in_flight.len() < opts.limit {
            let Some((url, depth)) = frontier.pop() else {
                break;
            };
            // Rate limiting applies per dispatch, not per completed fetch.
            if let Some(t) = ticker.as_mut() {
                t.tick().await;
            }
            spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
        }

        // Completion phase: wait for one fetch. `None` means nothing is running and
        // nothing could be dispatched, so the crawl is finished.
        let outcome = match in_flight.join_next().await {
            None => break,
            Some(Ok(o)) => o,
            Some(Err(e)) if e.is_panic() => {
                tracing::error!(err = %e, "crawl fetch task panicked");
                continue;
            }
            Some(Err(e)) => {
                tracing::warn!(err = %e, "crawl fetch task cancelled");
                continue;
            }
        };

        let FetchOutcome {
            url,
            depth,
            result,
            fetched_at,
        } = outcome;
        let page = match result {
            Ok(p) => p,
            Err(msg) => {
                // A failed fetch is still reported and counted as a result.
                let r = error_result(&url, depth, msg, fetched_at);
                on_page(&r);
                results.push(r);
                continue;
            }
        };

        // Pages already accounted for: recorded, still running, plus the one in hand.
        let budget_used = results.len() + in_flight.len() + 1;
        let mut ctx = CrawlContext {
            frontier: &mut frontier,
            robots: &robots,
            opts: &opts,
        };
        if let Some(r) = process_ok_fetch(&mut ctx, &url, depth, &page, budget_used, fetched_at) {
            on_page(&r);
            results.push(r);
        }
    }

    results
}
438
439fn spawn_fetch(
440    in_flight: &mut JoinSet<FetchOutcome>,
441    fetcher: &(impl PageFetcher + Clone),
442    opts: &CrawlPlan,
443    url: Url,
444    depth: usize,
445) {
446    let url_str = url.to_string();
447    let timeout = opts.timeout_secs;
448    let settle = opts.settle_ms;
449    let user_agent = opts.user_agent.clone();
450    let f = fetcher.clone();
451    in_flight.spawn_blocking(move || {
452        let result = f
453            .fetch_page(bridge::FetchOptions {
454                url: &url_str,
455                timeout_secs: timeout,
456                settle_ms: settle,
457                mode: bridge::FetchMode::Content { include_a11y: false },
458                user_agent: user_agent.as_deref(),
459            })
460            .map_err(|e| format!("{e:#}"));
461        FetchOutcome {
462            url,
463            depth,
464            result,
465            fetched_at: SystemTime::now(),
466        }
467    });
468}
469
/// Stable crawl state passed to `process_ok_fetch`.
///
/// Bundles the borrows that stay the same for every processed page, keeping the
/// function's parameter list short.
struct CrawlContext<'a> {
    /// Work queue and de-duplication state; mutated as links are enqueued.
    frontier: &'a mut Frontier,
    /// Robots policy consulted before a link is enqueued.
    robots: &'a RobotsPolicy,
    /// The crawl plan (limits, scope filters, extraction settings).
    opts: &'a CrawlPlan,
}
476
477/// Build a `CrawlPageResult` and enqueue discovered links.
478fn process_ok_fetch(
479    ctx: &mut CrawlContext<'_>,
480    url: &Url,
481    depth: usize,
482    page: &bridge::ServoPage,
483    budget_used: usize,
484    fetched_at: SystemTime,
485) -> Option<CrawlPageResult> {
486    let html = if page.html.len() > MAX_HTML_BYTES {
487        &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
488    } else {
489        &page.html
490    };
491
492    let input = crate::extract::ExtractInput::new(html, url.as_str())
493        .with_layout_json(page.layout_json.as_deref())
494        .with_inner_text(page.inner_text.as_deref())
495        .with_selector(ctx.opts.selector.as_deref());
496
497    let content = if ctx.opts.json {
498        crate::extract::extract_json(&input).ok()
499    } else {
500        crate::extract::extract_text(&input).ok()
501    };
502
503    if content.as_ref().is_some_and(|c| ctx.frontier.is_duplicate_content(c)) {
504        return None;
505    }
506
507    let links = extract_links_from_html(html, url);
508    let links_found = links.len();
509
510    if depth < ctx.opts.max_depth {
511        for link in &links {
512            if budget_used + ctx.frontier.pending() >= ctx.opts.limit {
513                break;
514            }
515            if !is_same_site(&ctx.opts.seed, link)
516                || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
517                || !ctx.robots.is_allowed(link)
518                || !matches_scope(link, ctx.opts.include.as_ref(), ctx.opts.exclude.as_ref())
519            {
520                continue;
521            }
522            ctx.frontier.try_enqueue(link.clone(), depth + 1);
523        }
524    }
525
526    let title = {
527        let doc = dom_query::Document::from(html);
528        let t = doc.select("title").text().to_string();
529        (!t.is_empty()).then_some(t)
530    };
531
532    Some(CrawlPageResult {
533        url: url.to_string(),
534        depth,
535        status: CrawlStatus::Ok,
536        title,
537        content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
538        error: None,
539        links_found,
540        fetched_at,
541    })
542}
543
/// Fetch result crossing the `JoinSet` boundary; error is pre-stringified for `Send`.
struct FetchOutcome {
    /// URL that was fetched.
    url: Url,
    /// Link depth of that URL from the seed.
    depth: usize,
    /// The fetched page, or the formatted error message.
    result: Result<bridge::ServoPage, String>,
    /// Wall-clock time taken right after the fetch returned.
    fetched_at: SystemTime,
}
551
552fn error_result(url: &Url, depth: usize, error: String, fetched_at: SystemTime) -> CrawlPageResult {
553    CrawlPageResult {
554        url: url.to_string(),
555        depth,
556        status: CrawlStatus::Error,
557        title: None,
558        content: None,
559        error: Some(error),
560        links_found: 0,
561        fetched_at,
562    }
563}
564
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;
    use std::sync::Arc;

    /// `CrawlOptions::new` yields the documented defaults.
    #[test]
    fn crawl_options_defaults() {
        let defaults = CrawlOptions::new("https://example.com");
        assert_eq!(defaults.url, "https://example.com");
        assert_eq!(defaults.limit, 50);
        assert_eq!(defaults.max_depth, 3);
        assert_eq!(defaults.timeout, Duration::from_secs(30));
        assert!(defaults.include.is_empty());
        assert!(defaults.exclude.is_empty());
        assert_eq!(defaults.concurrency, 1);
        assert_eq!(defaults.delay, Some(Duration::from_millis(500)));
    }

    /// Chained setters each store their value.
    #[test]
    fn crawl_options_chaining() {
        let base = CrawlOptions::new("https://example.com");
        let tuned = base
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(tuned.limit, 100);
        assert_eq!(tuned.max_depth, 5);
        assert_eq!(tuned.include, vec!["/docs/**"]);
        assert_eq!(tuned.exclude, vec!["/docs/archive/**"]);
        assert_eq!(tuned.concurrency, 4);
        assert_eq!(tuned.delay, None);
    }

    /// A concurrency of zero is raised to one.
    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let clamped = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(clamped.concurrency, 1);
    }

    /// A custom delay replaces the default.
    #[test]
    fn crawl_options_delay_custom_value() {
        let two_seconds = Some(Duration::from_secs(2));
        let slowed = CrawlOptions::new("https://example.com").delay(two_seconds);
        assert_eq!(slowed.delay, two_seconds);
    }

    /// CR and LF in a User-Agent are each replaced by a space.
    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let with_ua = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(with_ua.user_agent.as_deref(), Some("Crawler  /2.0"));
    }

    /// In-memory fetcher: maps a URL to the HTML it should return.
    #[derive(Clone)]
    struct MockFetcher(Arc<HashMap<String, String>>);

    impl MockFetcher {
        fn new(pages: &[(&str, &str)]) -> Self {
            let table: HashMap<String, String> = pages
                .iter()
                .map(|&(address, body)| (address.to_owned(), body.to_owned()))
                .collect();
            Self(Arc::new(table))
        }
    }

    impl PageFetcher for MockFetcher {
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> anyhow::Result<bridge::ServoPage> {
            match self.0.get(opts.url) {
                Some(body) => Ok(bridge::ServoPage {
                    html: body.clone(),
                    ..Default::default()
                }),
                None => Err(anyhow::anyhow!("not found: {}", opts.url)),
            }
        }
    }

    /// Page titled "Test" whose body is one anchor per entry of `links`.
    fn page(links: &[&str]) -> String {
        use std::fmt::Write;
        let mut body = String::new();
        for target in links {
            write!(body, r#"<a href="{target}">link</a>"#).unwrap();
        }
        format!("<html><head><title>Test</title></head><body>{body}</body></html>")
    }

    /// Leaf page with unique body to avoid content-hash dedup.
    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

    /// Test helper: build `CrawlPlan`, run, assert. `delay=None` keeps tests fast.
    ///
    /// The first entry of `pages` is used as the seed.
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        verify: impl FnOnce(&[CrawlPageResult]),
    ) {
        let fetcher = MockFetcher::new(pages);
        let (seed_url, _) = pages[0];
        let mut plan = CrawlPlan {
            seed: Url::parse(seed_url).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
        };
        configure(&mut plan);
        let results = run(plan, RobotsPolicy::Unavailable, &fetcher, |_| {}).await;
        verify(&results);
    }

    /// A seed without links produces exactly one result.
    #[tokio::test]
    async fn crawl_single_page() {
        let home = page(&[]);
        check(
            &[("https://example.com/", &home)],
            |_| {},
            |got| {
                assert_eq!(got.len(), 1);
                assert_eq!(got[0].url, "https://example.com/");
            },
        )
        .await;
    }

    /// Links on the seed page are followed.
    #[tokio::test]
    async fn crawl_follows_links() {
        let home = page(&["/a", "/b"]);
        let leaf_a = "<html><head><title>A</title></head><body>page a</body></html>";
        let leaf_b = "<html><head><title>B</title></head><body>page b</body></html>";
        check(
            &[
                ("https://example.com/", &home),
                ("https://example.com/a", leaf_a),
                ("https://example.com/b", leaf_b),
            ],
            |_| {},
            |got| assert_eq!(got.len(), 3),
        )
        .await;
    }

    /// With `max_depth = 1` only the seed and its direct links are crawled.
    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        let home = page(&["/a"]);
        let level_a = page(&["/b"]);
        let level_b = page(&["/c"]);
        let level_c = page(&[]);
        check(
            &[
                ("https://example.com/", &home),
                ("https://example.com/a", &level_a),
                ("https://example.com/b", &level_b),
                ("https://example.com/c", &level_c),
            ],
            |plan| plan.max_depth = 1,
            |got| assert_eq!(got.len(), 2),
        )
        .await;
    }

    /// The page limit caps the number of results.
    #[tokio::test]
    async fn crawl_respects_limit() {
        let home = page(&["/a", "/b", "/c"]);
        let leaf = page(&[]);
        check(
            &[
                ("https://example.com/", &home),
                ("https://example.com/a", &leaf),
                ("https://example.com/b", &leaf),
                ("https://example.com/c", &leaf),
            ],
            |plan| plan.limit = 2,
            |got| assert_eq!(got.len(), 2),
        )
        .await;
    }

    /// Links to another site are not followed.
    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        let home = page(&["https://other.com/x"]);
        let foreign = page(&[]);
        check(
            &[("https://example.com/", &home), ("https://other.com/x", &foreign)],
            |_| {},
            |got| assert_eq!(got.len(), 1),
        )
        .await;
    }

    /// Repeated and back-pointing links do not cause repeat visits.
    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        let home = page(&["/a", "/a", "/a"]);
        let back_to_home = page(&["/"]);
        check(
            &[
                ("https://example.com/", &home),
                ("https://example.com/a", &back_to_home),
            ],
            |_| {},
            |got| assert_eq!(got.len(), 2),
        )
        .await;
    }

    /// A page the fetcher cannot serve shows up as an error result.
    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        let home = page(&["/missing"]);
        check(
            &[("https://example.com/", &home)],
            |_| {},
            |got| {
                assert_eq!(got.len(), 2);
                let failed = &got[1];
                assert!(matches!(failed.status, CrawlStatus::Error));
                assert!(failed.error.is_some());
            },
        )
        .await;
    }

    /// Only links matching the include glob are crawled.
    #[tokio::test]
    async fn crawl_applies_include_glob() {
        let home = page(&["/docs/a", "/blog/b"]);
        let leaf = page(&[]);
        check(
            &[
                ("https://example.com/", &home),
                ("https://example.com/docs/a", &leaf),
                ("https://example.com/blog/b", &leaf),
            ],
            |plan| plan.include = Some(crate::scope::build_globset(&["/docs/**".into()]).unwrap()),
            |got| {
                assert_eq!(got.len(), 2);
                assert!(got.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!got.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    /// Links matching the exclude glob are skipped.
    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        let home = page(&["/public", "/secret/data"]);
        let leaf = page(&[]);
        check(
            &[
                ("https://example.com/", &home),
                ("https://example.com/public", &leaf),
                ("https://example.com/secret/data", &leaf),
            ],
            |plan| plan.exclude = Some(crate::scope::build_globset(&["/secret/**".into()]).unwrap()),
            |got| {
                assert_eq!(got.len(), 2);
                assert!(!got.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    /// Two URLs serving identical content yield a single result for that content.
    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let home = page(&["/a", "/b"]);
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        check(
            &[
                ("https://example.com/", &home),
                ("https://example.com/a", same),
                ("https://example.com/b", same),
            ],
            |_| {},
            |got| assert_eq!(got.len(), 2),
        )
        .await;
    }

    /// With four parallel fetches every page is still visited exactly once.
    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        let home = page(&["/a", "/b", "/c", "/d"]);
        let (a, b, c, d) = (
            distinct_page("a"),
            distinct_page("b"),
            distinct_page("c"),
            distinct_page("d"),
        );
        check(
            &[
                ("https://example.com/", &home),
                ("https://example.com/a", &a),
                ("https://example.com/b", &b),
                ("https://example.com/c", &c),
                ("https://example.com/d", &d),
            ],
            |plan| plan.concurrency = 4,
            |got| {
                assert_eq!(got.len(), 5);
                let urls: HashSet<&str> = got.iter().map(|p| p.url.as_str()).collect();
                let expected = [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ];
                for u in expected {
                    assert!(urls.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    /// The page limit also holds when fetches run in parallel.
    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        let home = page(&["/a", "/b", "/c", "/d"]);
        let (a, b, c, d) = (
            distinct_page("a"),
            distinct_page("b"),
            distinct_page("c"),
            distinct_page("d"),
        );
        check(
            &[
                ("https://example.com/", &home),
                ("https://example.com/a", &a),
                ("https://example.com/b", &b),
                ("https://example.com/c", &c),
                ("https://example.com/d", &d),
            ],
            |plan| {
                plan.concurrency = 4;
                plan.limit = 3;
            },
            |got| assert_eq!(got.len(), 3),
        )
        .await;
    }

    /// With a single fetch at a time, results come back in breadth-first order.
    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        let home = page(&["/a", "/b"]);
        let (a, b) = (distinct_page("a"), distinct_page("b"));
        check(
            &[
                ("https://example.com/", &home),
                ("https://example.com/a", &a),
                ("https://example.com/b", &b),
            ],
            |plan| plan.concurrency = 1,
            |got| {
                assert_eq!(got.len(), 3);
                assert_eq!(got[0].url, "https://example.com/");
                assert_eq!(got[1].url, "https://example.com/a");
                assert_eq!(got[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    /// The dispatch delay spaces out fetches (measured on tokio's paused clock).
    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        // 3 pages at 500ms delay = 2 ticks >= 1s (first dispatch is free).
        let home = page(&["/a", "/b"]);
        let (a, b) = (distinct_page("a"), distinct_page("b"));
        let start = tokio::time::Instant::now();
        check(
            &[
                ("https://example.com/", &home),
                ("https://example.com/a", &a),
                ("https://example.com/b", &b),
            ],
            |plan| {
                plan.concurrency = 1;
                plan.delay = Some(Duration::from_millis(500));
            },
            |got| assert_eq!(got.len(), 3),
        )
        .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    /// The seed counts as visited, and a URL can be enqueued only once.
    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert!(!frontier.try_enqueue(seed, 0));
        let other = Url::parse("https://example.com/page").unwrap();
        assert!(frontier.try_enqueue(other.clone(), 1));
        assert!(!frontier.try_enqueue(other, 1));
    }

    /// `pop` drains the queue in order and `pending` tracks its length.
    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert_eq!(frontier.pending(), 1);
        let (url, depth) = frontier.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);
        assert_eq!(frontier.pending(), 0);
        assert!(frontier.pop().is_none());
    }

    /// Only http/https targets survive link extraction.
    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
            <a href="javascript:void(0)">JS</a>
            <a href="JAVASCRIPT:alert(1)">JS upper</a>
            <a href="data:text/html,<h1>hi</h1>">Data</a>
            <a href="mailto:x@y.com">Mail</a>
            <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = extract_links_from_html(html, &base);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].as_str(), "https://example.com/a");
        assert_eq!(links[1].as_str(), "https://example.com/relative");
    }

    /// `error_result` fills in the error fields and leaves content empty.
    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let failed = error_result(&url, 2, "timeout".into(), SystemTime::now());
        assert!(matches!(failed.status, CrawlStatus::Error));
        assert_eq!(failed.error.as_deref(), Some("timeout"));
        assert!(failed.content.is_none());
    }

    /// Content is a duplicate only from its second sighting onwards.
    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert!(!frontier.is_duplicate_content("unique content"));
        assert!(frontier.is_duplicate_content("unique content"));
        assert!(!frontier.is_duplicate_content("different content"));
    }
}
servo_fetch/crawl.rs

servo_fetch/
crawl.rs