servo_fetch/
crawl.rs

1//! Site crawling — BFS link traversal with scope, robots.txt, and rate limiting.
2
3use std::collections::{HashSet, VecDeque};
4use std::hash::{DefaultHasher, Hash, Hasher};
5use std::time::{Duration, SystemTime};
6
7use tokio::task::{JoinSet, spawn_blocking};
8use tokio::time::{MissedTickBehavior, interval};
9use url::Url;
10
11use crate::bridge::{self, PageFetcher};
12use crate::net;
13use crate::robots::RobotsPolicy;
14use crate::scope::{is_same_site, matches_scope, normalize_url};
15
/// Maximum number of HTML bytes examined per page (2 MiB). Longer documents are
/// cut at a character boundary before content extraction and link discovery.
const MAX_HTML_BYTES: usize = 2 * 1024 * 1024;
17
/// Options for crawling a site.
///
/// Create with [`CrawlOptions::new`] and adjust with the chained setter methods;
/// the value is then consumed by `crawl()` or `crawl_each()`.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    /// Seed URL the crawl starts from.
    pub(crate) url: String,
    /// Maximum number of pages to crawl.
    pub(crate) limit: usize,
    /// Maximum link depth from the seed URL.
    pub(crate) max_depth: usize,
    /// Page load timeout applied to each page.
    pub(crate) timeout: Duration,
    /// Extra wait after the load event of each page.
    pub(crate) settle: Duration,
    /// URL path glob patterns to include; empty means no include filter.
    pub(crate) include: Vec<String>,
    /// URL path glob patterns to exclude; empty means no exclude filter.
    pub(crate) exclude: Vec<String>,
    /// Optional CSS selector used to extract a specific section per page.
    pub(crate) selector: Option<String>,
    /// Emit extracted content as JSON instead of Markdown.
    pub(crate) json: bool,
    /// Optional User-Agent override (stored after sanitization).
    pub(crate) user_agent: Option<String>,
    /// Maximum number of parallel fetches (the setter clamps this to at least 1).
    pub(crate) concurrency: usize,
    /// Minimum interval between fetch dispatches; `None` disables rate limiting.
    pub(crate) delay: Option<Duration>,
}
35
36impl CrawlOptions {
37    /// Create crawl options for the given seed URL.
38    pub fn new(url: &str) -> Self {
39        Self {
40            url: url.into(),
41            limit: 50,
42            max_depth: 3,
43            timeout: Duration::from_secs(30),
44            settle: Duration::ZERO,
45            include: Vec::new(),
46            exclude: Vec::new(),
47            selector: None,
48            json: false,
49            user_agent: None,
50            concurrency: 1,
51            delay: Some(Duration::from_millis(500)),
52        }
53    }
54
55    /// Maximum number of pages to crawl (default: 50).
56    pub fn limit(mut self, n: usize) -> Self {
57        self.limit = n;
58        self
59    }
60
61    /// Maximum link depth from the seed URL (default: 3).
62    pub fn max_depth(mut self, n: usize) -> Self {
63        self.max_depth = n;
64        self
65    }
66
67    /// Page load timeout per page (default: 30s).
68    pub fn timeout(mut self, timeout: Duration) -> Self {
69        self.timeout = timeout;
70        self
71    }
72
73    /// Extra wait after load event per page (default: 0).
74    pub fn settle(mut self, settle: Duration) -> Self {
75        self.settle = settle;
76        self
77    }
78
79    /// URL path glob patterns to include (e.g. `"/docs/**"`).
80    pub fn include(mut self, patterns: &[&str]) -> Self {
81        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
82        self
83    }
84
85    /// URL path glob patterns to exclude (e.g. `"/docs/archive/**"`).
86    pub fn exclude(mut self, patterns: &[&str]) -> Self {
87        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
88        self
89    }
90
91    /// Output crawled content as JSON instead of Markdown.
92    pub fn json(mut self, json: bool) -> Self {
93        self.json = json;
94        self
95    }
96
97    /// CSS selector to extract a specific section per page.
98    pub fn selector(mut self, selector: impl Into<String>) -> Self {
99        self.selector = Some(selector.into());
100        self
101    }
102
103    /// Override the User-Agent string for all pages in this crawl.
104    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
105        self.user_agent = Some(net::sanitize_user_agent(ua.into()));
106        self
107    }
108
109    /// Maximum parallel fetches (default: 1). Values below 1 are clamped to 1.
110    /// Results are yielded in completion order when greater than 1.
111    pub fn concurrency(mut self, n: usize) -> Self {
112        self.concurrency = n.max(1);
113        self
114    }
115
116    /// Minimum dispatch interval (default: `Some(500ms)`). `None` disables rate limiting.
117    pub fn delay(mut self, delay: Option<Duration>) -> Self {
118        self.delay = delay;
119        self
120    }
121}
122
/// Result for a single crawled page.
///
/// Serializes as a map tagged with `"type": "page"` on success or
/// `"type": "error"` on failure.
#[derive(Debug)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL of the crawled page.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Wall-clock time when the fetch completed.
    pub fetched_at: SystemTime,
    /// Page content if successful, or error if failed.
    pub outcome: Result<CrawlPage, crate::error::Error>,
}
136
/// Successfully crawled page.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Page title, when the document provides a non-empty one.
    pub title: Option<String>,
    /// Extracted content (Markdown or JSON depending on options).
    /// Empty when content extraction produced nothing for the page.
    pub content: String,
    /// Number of links discovered on this page, counted before scope,
    /// robots, and budget filtering.
    pub links_found: usize,
}
147
148impl serde::Serialize for CrawlResult {
149    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
150        use serde::ser::SerializeMap;
151        let fetched_at = humantime::format_rfc3339_millis(self.fetched_at).to_string();
152        match &self.outcome {
153            Ok(page) => {
154                let mut map = serializer.serialize_map(None)?;
155                map.serialize_entry("type", "page")?;
156                map.serialize_entry("url", &self.url)?;
157                map.serialize_entry("depth", &self.depth)?;
158                map.serialize_entry("fetched_at", &fetched_at)?;
159                if let Some(t) = &page.title {
160                    map.serialize_entry("title", t)?;
161                }
162                map.serialize_entry("content", &page.content)?;
163                map.serialize_entry("links_found", &page.links_found)?;
164                map.end()
165            }
166            Err(e) => {
167                let mut map = serializer.serialize_map(None)?;
168                map.serialize_entry("type", "error")?;
169                map.serialize_entry("url", &self.url)?;
170                map.serialize_entry("depth", &self.depth)?;
171                map.serialize_entry("fetched_at", &fetched_at)?;
172                map.serialize_entry("error", &e.to_string())?;
173                map.end()
174            }
175        }
176    }
177}
178
179impl CrawlResult {
180    fn from_internal(r: CrawlPageResult) -> Self {
181        let outcome = match r.status {
182            CrawlStatus::Ok => Ok(CrawlPage {
183                title: r.title,
184                content: r.content.unwrap_or_default(),
185                links_found: r.links_found,
186            }),
187            CrawlStatus::Error => Err(r
188                .error
189                .unwrap_or_else(|| crate::error::Error::engine("unknown crawl error", None))),
190        };
191        Self {
192            url: r.url,
193            depth: r.depth,
194            fetched_at: r.fetched_at,
195            outcome,
196        }
197    }
198}
199
200/// Crawl a site, invoking `on_page` for each result as it arrives.
201#[allow(clippy::needless_pass_by_value)]
202pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(CrawlResult)) -> crate::error::Result<()> {
203    net::ensure_crypto_provider();
204    let plan = build_crawl_plan(&opts)?;
205    crate::runtime::block_on(async {
206        let robots = spawn_blocking({
207            let seed = plan.seed.clone();
208            let user_agent = plan.user_agent.clone();
209            let timeout = Duration::from_secs(plan.timeout_secs);
210            move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
211        })
212        .await
213        .unwrap_or(RobotsPolicy::Unreachable);
214        run(plan, robots, &bridge::ServoFetcher, |r| {
215            on_page(CrawlResult::from_internal(r));
216        })
217        .await;
218    })
219    .map_err(|e| crate::error::Error::engine(e, None))
220}
221
222/// Crawl a site and collect all results.
223#[allow(clippy::needless_pass_by_value)]
224pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
225    let mut results = Vec::new();
226    crawl_each(opts, |r| results.push(r))?;
227    Ok(results)
228}
229
230fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
231    let seed = net::validate_url(&opts.url)?;
232    let include = if opts.include.is_empty() {
233        None
234    } else {
235        Some(crate::scope::build_globset(&opts.include)?)
236    };
237    let exclude = if opts.exclude.is_empty() {
238        None
239    } else {
240        Some(crate::scope::build_globset(&opts.exclude)?)
241    };
242    Ok(CrawlPlan {
243        seed,
244        limit: opts.limit,
245        max_depth: opts.max_depth,
246        timeout_secs: opts.timeout.as_secs().max(1),
247        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
248        include,
249        exclude,
250        selector: opts.selector.clone(),
251        json: opts.json,
252        user_agent: opts.user_agent.clone(),
253        concurrency: opts.concurrency,
254        delay: opts.delay,
255    })
256}
257
/// Crawl configuration.
///
/// Internal, validated form of the crawl settings consumed by `run`.
pub(crate) struct CrawlPlan {
    /// Seed URL the crawl starts from.
    pub seed: Url,
    /// Maximum number of pages to report.
    pub limit: usize,
    /// Maximum link depth from the seed; links on pages at this depth are not followed.
    pub max_depth: usize,
    /// Per-page load timeout in seconds.
    pub timeout_secs: u64,
    /// Extra wait after each page load, in milliseconds.
    pub settle_ms: u64,
    /// Compiled include globs; `None` means no include filter.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude globs; `None` means no exclude filter.
    pub exclude: Option<globset::GlobSet>,
    /// Optional CSS selector handed to content extraction.
    pub selector: Option<String>,
    /// Extract content as JSON instead of text.
    pub json: bool,
    /// Optional User-Agent override sent with every fetch.
    pub user_agent: Option<String>,
    /// Parallel fetch limit (clamped to >=1; yields in completion order when >1).
    pub concurrency: usize,
    /// Dispatch interval; `None` disables rate limiting.
    pub delay: Option<Duration>,
}
275
/// Result for a single crawled page.
///
/// Internal record passed to the `on_page` callback of `run`.
pub(crate) struct CrawlPageResult {
    /// URL of the page.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Whether the fetch succeeded or failed.
    pub status: CrawlStatus,
    /// Page title; always `None` for failed fetches.
    pub title: Option<String>,
    /// Extracted content; `None` for failed fetches or when extraction failed.
    pub content: Option<String>,
    /// Fetch error; set for failed fetches, `None` otherwise.
    pub error: Option<crate::error::Error>,
    /// Number of links found on the page (0 for failed fetches).
    pub links_found: usize,
    /// Wall-clock time when the fetch completed.
    pub fetched_at: SystemTime,
}
287
/// Status of a crawled page.
pub(crate) enum CrawlStatus {
    /// The page was fetched and processed.
    Ok,
    /// The fetch failed; the accompanying result carries the error.
    Error,
}
293
/// Crawl frontier: the FIFO queue of pages still to fetch plus the
/// bookkeeping used to avoid repeating URLs and content.
struct Frontier {
    /// Pending `(url, depth)` pairs, popped from the front.
    queue: VecDeque<(Url, usize)>,
    /// Normalized form of every URL that has ever been enqueued.
    visited: HashSet<String>,
    /// 64-bit hashes of page content already seen.
    content_hashes: HashSet<u64>,
}
299
300impl Frontier {
301    fn new(seed: &Url) -> Self {
302        Self {
303            queue: VecDeque::from([(seed.clone(), 0)]),
304            visited: HashSet::from([normalize_url(seed)]),
305            content_hashes: HashSet::new(),
306        }
307    }
308
309    fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
310        if self.visited.insert(normalize_url(&url)) {
311            self.queue.push_back((url, depth));
312            true
313        } else {
314            false
315        }
316    }
317
318    fn pop(&mut self) -> Option<(Url, usize)> {
319        self.queue.pop_front()
320    }
321
322    fn is_duplicate_content(&mut self, content: &str) -> bool {
323        let mut h = DefaultHasher::new();
324        content.hash(&mut h);
325        !self.content_hashes.insert(h.finish())
326    }
327
328    fn pending(&self) -> usize {
329        self.queue.len()
330    }
331}
332
333fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
334    dom_query::Document::from(html)
335        .select("a[href]")
336        .iter()
337        .filter_map(|el| {
338            let href = el.attr("href")?;
339            let href = href.trim();
340            if href.is_empty() {
341                return None;
342            }
343            let resolved = base.join(href).ok()?;
344            matches!(resolved.scheme(), "http" | "https").then_some(resolved)
345        })
346        .collect()
347}
348
349pub(crate) async fn run(
350    opts: CrawlPlan,
351    robots: RobotsPolicy,
352    fetcher: &(impl PageFetcher + Clone),
353    mut on_page: impl FnMut(CrawlPageResult),
354) {
355    let mut frontier = Frontier::new(&opts.seed);
356    let mut completed: usize = 0;
357    let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();
358
359    // `Delay` keeps the steady-state rate correct after fetches exceed `delay`.
360    let mut ticker = opts.delay.map(|period| {
361        let mut t = interval(period);
362        t.set_missed_tick_behavior(MissedTickBehavior::Delay);
363        t
364    });
365
366    let concurrency = opts.concurrency.max(1);
367
368    loop {
369        while in_flight.len() < concurrency && completed + in_flight.len() < opts.limit {
370            let Some((url, depth)) = frontier.pop() else {
371                break;
372            };
373            if let Some(t) = ticker.as_mut() {
374                t.tick().await;
375            }
376            spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
377        }
378
379        let outcome = match in_flight.join_next().await {
380            None => break,
381            Some(Ok(o)) => o,
382            Some(Err(e)) if e.is_panic() => {
383                tracing::error!(err = %e, "crawl fetch task panicked");
384                continue;
385            }
386            Some(Err(e)) => {
387                tracing::warn!(err = %e, "crawl fetch task cancelled");
388                continue;
389            }
390        };
391
392        let FetchOutcome {
393            url,
394            depth,
395            result,
396            fetched_at,
397        } = outcome;
398        let page = match result {
399            Ok(p) => p,
400            Err(err) => {
401                on_page(error_result(&url, depth, err, fetched_at));
402                completed += 1;
403                continue;
404            }
405        };
406
407        let budget_used = completed + in_flight.len() + 1;
408        let mut ctx = CrawlContext {
409            frontier: &mut frontier,
410            robots: &robots,
411            opts: &opts,
412        };
413        if let Some(r) = process_ok_fetch(&mut ctx, &url, depth, &page, budget_used, fetched_at) {
414            on_page(r);
415            completed += 1;
416        }
417    }
418}
419
420fn spawn_fetch(
421    in_flight: &mut JoinSet<FetchOutcome>,
422    fetcher: &(impl PageFetcher + Clone),
423    opts: &CrawlPlan,
424    url: Url,
425    depth: usize,
426) {
427    let url_str = url.to_string();
428    let timeout = opts.timeout_secs;
429    let settle = opts.settle_ms;
430    let user_agent = opts.user_agent.clone();
431    let f = fetcher.clone();
432    in_flight.spawn_blocking(move || {
433        let result = f
434            .fetch_page(bridge::FetchOptions {
435                url: &url_str,
436                timeout_secs: timeout,
437                settle_ms: settle,
438                mode: bridge::FetchMode::Content { include_a11y: false },
439                user_agent: user_agent.as_deref(),
440            })
441            .map_err(|e| crate::error::Error::engine(e, Some(url_str.clone())));
442        FetchOutcome {
443            url,
444            depth,
445            result,
446            fetched_at: SystemTime::now(),
447        }
448    });
449}
450
/// Stable crawl state passed to `process_ok_fetch`.
struct CrawlContext<'a> {
    /// Frontier that receives newly discovered links and tracks seen content.
    frontier: &'a mut Frontier,
    /// Robots policy consulted before a link is enqueued.
    robots: &'a RobotsPolicy,
    /// Plan supplying limits, scope filters, and extraction settings.
    opts: &'a CrawlPlan,
}
457
458/// Build a `CrawlPageResult` and enqueue discovered links.
459fn process_ok_fetch(
460    ctx: &mut CrawlContext<'_>,
461    url: &Url,
462    depth: usize,
463    page: &bridge::ServoPage,
464    budget_used: usize,
465    fetched_at: SystemTime,
466) -> Option<CrawlPageResult> {
467    let html = if page.html.len() > MAX_HTML_BYTES {
468        &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
469    } else {
470        &page.html
471    };
472
473    let input = crate::extract::ExtractInput::new(html, url.as_str())
474        .with_layout_json(page.layout_json.as_deref())
475        .with_inner_text(page.inner_text.as_deref())
476        .with_selector(ctx.opts.selector.as_deref());
477
478    let content = if ctx.opts.json {
479        crate::extract::extract_json(&input).ok()
480    } else {
481        crate::extract::extract_text(&input).ok()
482    };
483
484    if content.as_ref().is_some_and(|c| ctx.frontier.is_duplicate_content(c)) {
485        return None;
486    }
487
488    let links = extract_links_from_html(html, url);
489    let links_found = links.len();
490
491    if depth < ctx.opts.max_depth {
492        for link in &links {
493            if budget_used + ctx.frontier.pending() >= ctx.opts.limit {
494                break;
495            }
496            if !is_same_site(&ctx.opts.seed, link)
497                || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
498                || !ctx.robots.is_allowed(link)
499                || !matches_scope(link, ctx.opts.include.as_ref(), ctx.opts.exclude.as_ref())
500            {
501                continue;
502            }
503            ctx.frontier.try_enqueue(link.clone(), depth + 1);
504        }
505    }
506
507    let title = {
508        let doc = dom_query::Document::from(html);
509        let t = doc.select("title").text().to_string();
510        (!t.is_empty()).then_some(t)
511    };
512
513    Some(CrawlPageResult {
514        url: url.to_string(),
515        depth,
516        status: CrawlStatus::Ok,
517        title,
518        content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
519        error: None,
520        links_found,
521        fetched_at,
522    })
523}
524
/// Fetch result crossing the `JoinSet` boundary.
struct FetchOutcome {
    /// URL that was fetched.
    url: Url,
    /// Link depth of the URL from the seed.
    depth: usize,
    /// Fetched page, or the error produced by the fetcher.
    result: Result<bridge::ServoPage, crate::error::Error>,
    /// Wall-clock time taken right after the fetch returned.
    fetched_at: SystemTime,
}
532
533fn error_result(url: &Url, depth: usize, error: crate::error::Error, fetched_at: SystemTime) -> CrawlPageResult {
534    CrawlPageResult {
535        url: url.to_string(),
536        depth,
537        status: CrawlStatus::Error,
538        title: None,
539        content: None,
540        error: Some(error),
541        links_found: 0,
542        fetched_at,
543    }
544}
545
// Unit tests: option builders, the crawl loop driven by an in-memory fetcher,
// and the frontier / link-extraction helpers.
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;

    use super::*;

    /// `CrawlOptions::new` applies the documented defaults.
    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
        assert_eq!(opts.concurrency, 1);
        assert_eq!(opts.delay, Some(Duration::from_millis(500)));
    }

    /// Chained setters each store their value.
    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
        assert_eq!(opts.concurrency, 4);
        assert_eq!(opts.delay, None);
    }

    /// A concurrency of 0 is raised to 1.
    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let opts = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(opts.concurrency, 1);
    }

    /// A custom delay is stored unchanged.
    #[test]
    fn crawl_options_delay_custom_value() {
        let opts = CrawlOptions::new("https://example.com").delay(Some(Duration::from_secs(2)));
        assert_eq!(opts.delay, Some(Duration::from_secs(2)));
    }

    /// CR and LF in a User-Agent are replaced before the value is stored.
    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler  /2.0"));
    }

    /// In-memory fetcher: maps URL strings to canned HTML; unknown URLs fail.
    #[derive(Clone)]
    struct MockFetcher(Arc<HashMap<String, String>>);

    impl MockFetcher {
        /// Build a fetcher from `(url, html)` pairs.
        fn new(pages: &[(&str, &str)]) -> Self {
            Self(Arc::new(
                pages.iter().map(|(u, h)| (u.to_string(), h.to_string())).collect(),
            ))
        }
    }

    impl PageFetcher for MockFetcher {
        /// Return the canned page for `opts.url`, or a "not found" error.
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> anyhow::Result<bridge::ServoPage> {
            self.0
                .get(opts.url)
                .map(|html| bridge::ServoPage {
                    html: html.clone(),
                    ..Default::default()
                })
                .ok_or_else(|| anyhow::anyhow!("not found: {}", opts.url))
        }
    }

    /// HTML page titled "Test" whose body is one anchor per entry in `links`.
    fn page(links: &[&str]) -> String {
        use std::fmt::Write as _;
        let mut anchors = String::new();
        for l in links {
            write!(anchors, r#"<a href="{l}">link</a>"#).unwrap();
        }
        format!("<html><head><title>Test</title></head><body>{anchors}</body></html>")
    }

    /// Leaf page with unique body to avoid content-hash dedup.
    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

    /// Test helper: build `CrawlPlan`, run, assert. `delay=None` keeps tests fast.
    ///
    /// The first entry of `pages` is the seed. `configure` may adjust the plan
    /// before the crawl; `assert` receives every reported result.
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        assert: impl FnOnce(&[CrawlPageResult]),
    ) {
        let fetcher = MockFetcher::new(pages);
        let seed = pages[0].0;
        let mut opts = CrawlPlan {
            seed: Url::parse(seed).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
        };
        configure(&mut opts);
        let mut results = Vec::new();
        run(opts, RobotsPolicy::Unavailable, &fetcher, |r| results.push(r)).await;
        assert(&results);
    }

    /// A seed without links yields exactly one result.
    #[tokio::test]
    async fn crawl_single_page() {
        check(
            &[("https://example.com/", &page(&[]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 1);
                assert_eq!(r[0].url, "https://example.com/");
            },
        )
        .await;
    }

    /// Links on the seed are fetched as well.
    #[tokio::test]
    async fn crawl_follows_links() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                (
                    "https://example.com/a",
                    "<html><head><title>A</title></head><body>page a</body></html>",
                ),
                (
                    "https://example.com/b",
                    "<html><head><title>B</title></head><body>page b</body></html>",
                ),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    /// With `max_depth = 1` only the seed and its direct links are crawled.
    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a"])),
                ("https://example.com/a", &page(&["/b"])),
                ("https://example.com/b", &page(&["/c"])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.max_depth = 1,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    /// No more than `limit` pages are reported.
    #[tokio::test]
    async fn crawl_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c"])),
                ("https://example.com/a", &page(&[])),
                ("https://example.com/b", &page(&[])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.limit = 2,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    /// Links to another site are not followed.
    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        check(
            &[
                ("https://example.com/", &page(&["https://other.com/x"])),
                ("https://other.com/x", &page(&[])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 1),
        )
        .await;
    }

    /// Repeated links and links back to the seed are fetched only once.
    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/a", "/a"])),
                ("https://example.com/a", &page(&["/"])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    /// A failing fetch is reported as an error result rather than dropped.
    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        check(
            &[("https://example.com/", &page(&["/missing"]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 2);
                assert!(matches!(r[1].status, CrawlStatus::Error));
                assert!(r[1].error.is_some());
            },
        )
        .await;
    }

    /// Only links matching the include globs are followed.
    #[tokio::test]
    async fn crawl_applies_include_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/docs/a", "/blog/b"])),
                ("https://example.com/docs/a", &page(&[])),
                ("https://example.com/blog/b", &page(&[])),
            ],
            |o| o.include = Some(crate::scope::build_globset(&["/docs/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(r.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!r.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    /// Links matching the exclude globs are skipped.
    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/public", "/secret/data"])),
                ("https://example.com/public", &page(&[])),
                ("https://example.com/secret/data", &page(&[])),
            ],
            |o| o.exclude = Some(crate::scope::build_globset(&["/secret/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(!r.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    /// Two pages with identical HTML produce a single result between them.
    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", same),
                ("https://example.com/b", same),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    /// With concurrency 4 every page is still reported (order not asserted).
    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| o.concurrency = 4,
            |r| {
                assert_eq!(r.len(), 5);
                let urls: HashSet<&str> = r.iter().map(|p| p.url.as_str()).collect();
                for u in [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ] {
                    assert!(urls.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    /// The page limit holds when fetches run in parallel.
    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| {
                o.concurrency = 4;
                o.limit = 3;
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    /// With concurrency 1 results arrive in breadth-first discovery order.
    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| o.concurrency = 1,
            |r| {
                assert_eq!(r.len(), 3);
                assert_eq!(r[0].url, "https://example.com/");
                assert_eq!(r[1].url, "https://example.com/a");
                assert_eq!(r[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    /// The dispatch delay spaces out fetches; measured on tokio's paused clock.
    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        // 3 pages at 500ms delay = 2 ticks >= 1s (first dispatch is free).
        let start = tokio::time::Instant::now();
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| {
                o.concurrency = 1;
                o.delay = Some(Duration::from_millis(500));
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    /// `try_enqueue` rejects the seed and any URL it has already accepted.
    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.try_enqueue(seed, 0));
        let other = Url::parse("https://example.com/page").unwrap();
        assert!(f.try_enqueue(other.clone(), 1));
        assert!(!f.try_enqueue(other, 1));
    }

    /// A new frontier holds only the seed at depth 0; popping it empties the queue.
    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert_eq!(f.pending(), 1);
        let (url, depth) = f.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);
        assert_eq!(f.pending(), 0);
        assert!(f.pop().is_none());
    }

    /// Only http(s) targets survive link extraction; relative hrefs are resolved.
    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
            <a href="javascript:void(0)">JS</a>
            <a href="JAVASCRIPT:alert(1)">JS upper</a>
            <a href="data:text/html,<h1>hi</h1>">Data</a>
            <a href="mailto:x@y.com">Mail</a>
            <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = extract_links_from_html(html, &base);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].as_str(), "https://example.com/a");
        assert_eq!(links[1].as_str(), "https://example.com/relative");
    }

    /// `error_result` marks the status as `Error`, keeps the error, and has no content.
    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let r = error_result(&url, 2, crate::error::Error::engine("timeout", None), SystemTime::now());
        assert!(matches!(r.status, CrawlStatus::Error));
        assert!(r.error.as_ref().is_some_and(|e| e.to_string().contains("timeout")));
        assert!(r.content.is_none());
    }

    /// Content is a duplicate only on its second and later sightings.
    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.is_duplicate_content("unique content"));
        assert!(f.is_duplicate_content("unique content"));
        assert!(!f.is_duplicate_content("different content"));
    }
}
servo_fetch/crawl.rs

servo_fetch/
crawl.rs