servo_fetch/
crawl.rs

1//! Site crawling — BFS link traversal with scope, robots.txt, and rate limiting.
2
3use std::collections::{HashSet, VecDeque};
4use std::hash::{DefaultHasher, Hash, Hasher};
5use std::time::{Duration, SystemTime};
6
7use tokio::task::{JoinSet, spawn_blocking};
8use tokio::time::{MissedTickBehavior, interval};
9use url::Url;
10
11use crate::bridge::{self, PageFetcher};
12use crate::net;
13use crate::robots::RobotsPolicy;
14use crate::scope::{is_same_site, matches_scope, normalize_url};
15
/// Cap on the HTML bytes processed per page (2 MiB). `process_ok_fetch` truncates
/// longer documents to this size before content extraction and link discovery.
const MAX_HTML_BYTES: usize = 2 * 1024 * 1024;
17
/// Options for crawling a site.
///
/// Created with [`CrawlOptions::new`] and refined through the chained setter methods;
/// the fields themselves are crate-private.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    /// Seed URL as supplied by the caller; validated when the crawl plan is built.
    pub(crate) url: String,
    /// Maximum number of pages to crawl.
    pub(crate) limit: usize,
    /// Maximum link depth from the seed URL (the seed itself is depth 0).
    pub(crate) max_depth: usize,
    /// Page load timeout applied to each page.
    pub(crate) timeout: Duration,
    /// Extra wait after the load event, applied to each page.
    pub(crate) settle: Duration,
    /// URL path glob patterns to include; empty means no include filter.
    pub(crate) include: Vec<String>,
    /// URL path glob patterns to exclude; empty means no exclude filter.
    pub(crate) exclude: Vec<String>,
    /// Optional CSS selector used to extract a specific section per page.
    pub(crate) selector: Option<String>,
    /// When `true`, page content is extracted as JSON instead of Markdown.
    pub(crate) json: bool,
    /// User-Agent override for every page; `None` means no override was requested.
    pub(crate) user_agent: Option<String>,
    /// Maximum number of parallel fetches (the setter clamps this to at least 1).
    pub(crate) concurrency: usize,
    /// Minimum interval between dispatches; `None` disables rate limiting.
    pub(crate) delay: Option<Duration>,
    /// Session cookies seeded before crawling.
    pub(crate) cookies: Vec<crate::cookies::CookieSpec>,
}
36
37impl CrawlOptions {
38    /// Create crawl options for the given seed URL.
39    pub fn new(url: &str) -> Self {
40        Self {
41            url: url.into(),
42            limit: 50,
43            max_depth: 3,
44            timeout: Duration::from_secs(30),
45            settle: Duration::ZERO,
46            include: Vec::new(),
47            exclude: Vec::new(),
48            selector: None,
49            json: false,
50            user_agent: None,
51            concurrency: 1,
52            delay: Some(Duration::from_millis(500)),
53            cookies: Vec::new(),
54        }
55    }
56
57    /// Maximum number of pages to crawl (default: 50).
58    pub fn limit(mut self, n: usize) -> Self {
59        self.limit = n;
60        self
61    }
62
63    /// Maximum link depth from the seed URL (default: 3).
64    pub fn max_depth(mut self, n: usize) -> Self {
65        self.max_depth = n;
66        self
67    }
68
69    /// Page load timeout per page (default: 30s).
70    pub fn timeout(mut self, timeout: Duration) -> Self {
71        self.timeout = timeout;
72        self
73    }
74
75    /// Extra wait after load event per page (default: 0).
76    pub fn settle(mut self, settle: Duration) -> Self {
77        self.settle = settle;
78        self
79    }
80
81    /// URL path glob patterns to include (e.g. `"/docs/**"`).
82    pub fn include(mut self, patterns: &[&str]) -> Self {
83        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
84        self
85    }
86
87    /// URL path glob patterns to exclude (e.g. `"/docs/archive/**"`).
88    pub fn exclude(mut self, patterns: &[&str]) -> Self {
89        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
90        self
91    }
92
93    /// Output crawled content as JSON instead of Markdown.
94    pub fn json(mut self, json: bool) -> Self {
95        self.json = json;
96        self
97    }
98
99    /// CSS selector to extract a specific section per page.
100    pub fn selector(mut self, selector: impl Into<String>) -> Self {
101        self.selector = Some(selector.into());
102        self
103    }
104
105    /// Override the User-Agent string for all pages in this crawl.
106    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
107        self.user_agent = Some(net::sanitize_user_agent(ua.into()));
108        self
109    }
110
111    /// Maximum parallel fetches (default: 1). Values below 1 are clamped to 1.
112    /// Results are yielded in completion order when greater than 1.
113    pub fn concurrency(mut self, n: usize) -> Self {
114        self.concurrency = n.max(1);
115        self
116    }
117
118    /// Minimum dispatch interval (default: `Some(500ms)`). `None` disables rate limiting.
119    pub fn delay(mut self, delay: Option<Duration>) -> Self {
120        self.delay = delay;
121        self
122    }
123
124    /// Seed session cookies before crawling, scoped to the seed's site.
125    pub fn cookies(mut self, cookies: Vec<crate::cookies::CookieSpec>) -> Self {
126        self.cookies = cookies;
127        self
128    }
129}
130
/// Result for a single crawled page.
///
/// Carries the page's position in the crawl (`url`, `depth`), when the fetch finished,
/// and either the extracted page or the error that prevented it. Its `Serialize`
/// impl writes a flat map tagged with `"type"`: `"page"` or `"error"`.
#[derive(Debug)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL of the crawled page.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Wall-clock time when the fetch completed.
    pub fetched_at: SystemTime,
    /// Page content if successful, or error if failed.
    pub outcome: Result<CrawlPage, crate::error::Error>,
}
144
/// Successfully crawled page.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Page title; `None` when the page yields no non-empty `<title>` text.
    pub title: Option<String>,
    /// Extracted content (Markdown or JSON depending on options).
    /// Empty when extraction produced nothing for this page.
    pub content: String,
    /// Number of links discovered on this page.
    /// Counts every resolvable http(s) link, before site/scope/robots filtering.
    pub links_found: usize,
}
155
156impl serde::Serialize for CrawlResult {
157    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
158        use serde::ser::SerializeMap;
159        let fetched_at = humantime::format_rfc3339_millis(self.fetched_at).to_string();
160        match &self.outcome {
161            Ok(page) => {
162                let mut map = serializer.serialize_map(None)?;
163                map.serialize_entry("type", "page")?;
164                map.serialize_entry("url", &self.url)?;
165                map.serialize_entry("depth", &self.depth)?;
166                map.serialize_entry("fetched_at", &fetched_at)?;
167                if let Some(t) = &page.title {
168                    map.serialize_entry("title", t)?;
169                }
170                map.serialize_entry("content", &page.content)?;
171                map.serialize_entry("links_found", &page.links_found)?;
172                map.end()
173            }
174            Err(e) => {
175                let mut map = serializer.serialize_map(None)?;
176                map.serialize_entry("type", "error")?;
177                map.serialize_entry("url", &self.url)?;
178                map.serialize_entry("depth", &self.depth)?;
179                map.serialize_entry("fetched_at", &fetched_at)?;
180                map.serialize_entry("error", &e.to_string())?;
181                map.end()
182            }
183        }
184    }
185}
186
187impl CrawlResult {
188    fn from_internal(r: CrawlPageResult) -> Self {
189        let outcome = match r.status {
190            CrawlStatus::Ok => Ok(CrawlPage {
191                title: r.title,
192                content: r.content.unwrap_or_default(),
193                links_found: r.links_found,
194            }),
195            CrawlStatus::Error => Err(r
196                .error
197                .unwrap_or_else(|| crate::error::Error::engine("unknown crawl error", None))),
198        };
199        Self {
200            url: r.url,
201            depth: r.depth,
202            fetched_at: r.fetched_at,
203            outcome,
204        }
205    }
206}
207
208/// Crawl a site, invoking `on_page` for each result as it arrives (blocking).
209pub fn crawl_each_blocking<F>(opts: &CrawlOptions, on_page: F) -> crate::error::Result<()>
210where
211    F: FnMut(CrawlResult) + Send,
212{
213    crate::runtime::block_on(crawl_each(opts, on_page)).map_err(|e| crate::error::Error::engine(e, None))?
214}
215
216/// Crawl a site, invoking `on_page` for each result as it arrives.
217pub async fn crawl_each<F>(opts: &CrawlOptions, mut on_page: F) -> crate::error::Result<()>
218where
219    F: FnMut(CrawlResult) + Send,
220{
221    net::ensure_crypto_provider();
222    let plan = build_crawl_plan(opts)?;
223    let robots = spawn_blocking({
224        let seed = plan.seed.clone();
225        let user_agent = plan.user_agent.clone();
226        let timeout = Duration::from_secs(plan.timeout_secs);
227        move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
228    })
229    .await
230    .unwrap_or(RobotsPolicy::Unreachable);
231    run(plan, robots, &bridge::ServoFetcher, |r| {
232        on_page(CrawlResult::from_internal(r));
233    })
234    .await;
235    Ok(())
236}
237
238/// Crawl a site and collect all results (blocking).
239pub fn crawl_blocking(opts: &CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
240    let mut results = Vec::new();
241    crawl_each_blocking(opts, |r| results.push(r))?;
242    Ok(results)
243}
244
245/// Crawl a site and collect all results.
246pub async fn crawl(opts: &CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
247    let mut results = Vec::new();
248    crawl_each(opts, |r| results.push(r)).await?;
249    Ok(results)
250}
251
252fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
253    let seed = net::validate_url(&opts.url)?;
254    let include = if opts.include.is_empty() {
255        None
256    } else {
257        Some(crate::scope::build_globset(&opts.include)?)
258    };
259    let exclude = if opts.exclude.is_empty() {
260        None
261    } else {
262        Some(crate::scope::build_globset(&opts.exclude)?)
263    };
264    Ok(CrawlPlan {
265        seed,
266        limit: opts.limit,
267        max_depth: opts.max_depth,
268        timeout_secs: opts.timeout.as_secs().max(1),
269        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
270        include,
271        exclude,
272        selector: opts.selector.clone(),
273        json: opts.json,
274        user_agent: opts.user_agent.clone(),
275        concurrency: opts.concurrency,
276        delay: opts.delay,
277        cookies: opts.cookies.clone(),
278    })
279}
280
/// Crawl configuration.
///
/// The validated, engine-ready form of [`CrawlOptions`]: the seed is a parsed URL,
/// glob patterns are compiled, and durations are plain integers.
pub(crate) struct CrawlPlan {
    /// Parsed seed URL; the crawl starts here at depth 0.
    pub seed: Url,
    /// Maximum number of pages to report.
    pub limit: usize,
    /// Links found on pages at this depth or deeper are not followed.
    pub max_depth: usize,
    /// Per-page load timeout in whole seconds (also used for the robots.txt fetch).
    pub timeout_secs: u64,
    /// Extra wait after the load event, in milliseconds.
    pub settle_ms: u64,
    /// Compiled include patterns; `None` means no include filter.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude patterns; `None` means no exclude filter.
    pub exclude: Option<globset::GlobSet>,
    /// Optional CSS selector passed to content extraction.
    pub selector: Option<String>,
    /// Extract content as JSON (`true`) or as text (`false`).
    pub json: bool,
    /// User-Agent override handed to the fetcher; `None` means no override.
    pub user_agent: Option<String>,
    /// Parallel fetch limit (clamped to >=1; yields in completion order when >1).
    pub concurrency: usize,
    /// Dispatch interval; `None` disables rate limiting.
    pub delay: Option<Duration>,
    /// Cookies handed to the fetcher for every page.
    pub cookies: Vec<crate::cookies::CookieSpec>,
}
299
/// Result for a single crawled page.
///
/// Crate-internal record emitted by `run`; `CrawlResult::from_internal` converts it
/// into the public type.
pub(crate) struct CrawlPageResult {
    /// URL of the page this record describes.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Whether the fetch succeeded or failed.
    pub status: CrawlStatus,
    /// Page title, when the page has non-empty `<title>` text.
    pub title: Option<String>,
    /// Extracted content; `None` for failed fetches or when extraction failed.
    pub content: Option<String>,
    /// Failure cause; set for `CrawlStatus::Error` records.
    pub error: Option<crate::error::Error>,
    /// Number of links discovered on the page (0 for failed fetches).
    pub links_found: usize,
    /// Wall-clock time when the fetch completed.
    pub fetched_at: SystemTime,
}
311
/// Status of a crawled page.
pub(crate) enum CrawlStatus {
    /// The page was fetched and processed.
    Ok,
    /// The fetch failed; the record's `error` field carries the cause.
    Error,
}
317
/// Crawl frontier: the FIFO queue of pages still to fetch plus the dedup state.
struct Frontier {
    /// Pending `(url, depth)` pairs, popped from the front.
    queue: VecDeque<(Url, usize)>,
    /// Normalized form of every URL ever enqueued (including the seed).
    visited: HashSet<String>,
    /// Hashes of page contents already seen, used to drop duplicate pages.
    content_hashes: HashSet<u64>,
}
323
324impl Frontier {
325    fn new(seed: &Url) -> Self {
326        Self {
327            queue: VecDeque::from([(seed.clone(), 0)]),
328            visited: HashSet::from([normalize_url(seed)]),
329            content_hashes: HashSet::new(),
330        }
331    }
332
333    fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
334        if self.visited.insert(normalize_url(&url)) {
335            self.queue.push_back((url, depth));
336            true
337        } else {
338            false
339        }
340    }
341
342    fn pop(&mut self) -> Option<(Url, usize)> {
343        self.queue.pop_front()
344    }
345
346    fn is_duplicate_content(&mut self, content: &str) -> bool {
347        let mut h = DefaultHasher::new();
348        content.hash(&mut h);
349        !self.content_hashes.insert(h.finish())
350    }
351
352    fn pending(&self) -> usize {
353        self.queue.len()
354    }
355}
356
357fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
358    dom_query::Document::from(html)
359        .select("a[href]")
360        .iter()
361        .filter_map(|el| {
362            let href = el.attr("href")?;
363            let href = href.trim();
364            if href.is_empty() {
365                return None;
366            }
367            let resolved = base.join(href).ok()?;
368            matches!(resolved.scheme(), "http" | "https").then_some(resolved)
369        })
370        .collect()
371}
372
373pub(crate) async fn run(
374    opts: CrawlPlan,
375    robots: RobotsPolicy,
376    fetcher: &(impl PageFetcher + Clone),
377    mut on_page: impl FnMut(CrawlPageResult),
378) {
379    let mut frontier = Frontier::new(&opts.seed);
380    let mut completed: usize = 0;
381    let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();
382
383    // `Delay` keeps the steady-state rate correct after fetches exceed `delay`.
384    let mut ticker = opts.delay.map(|period| {
385        let mut t = interval(period);
386        t.set_missed_tick_behavior(MissedTickBehavior::Delay);
387        t
388    });
389
390    let concurrency = opts.concurrency.max(1);
391
392    loop {
393        while in_flight.len() < concurrency && completed + in_flight.len() < opts.limit {
394            let Some((url, depth)) = frontier.pop() else {
395                break;
396            };
397            if let Some(t) = ticker.as_mut() {
398                t.tick().await;
399            }
400            spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
401        }
402
403        let outcome = match in_flight.join_next().await {
404            None => break,
405            Some(Ok(o)) => o,
406            Some(Err(e)) if e.is_panic() => {
407                tracing::error!(err = %e, "crawl fetch task panicked");
408                continue;
409            }
410            Some(Err(e)) => {
411                tracing::warn!(err = %e, "crawl fetch task cancelled");
412                continue;
413            }
414        };
415
416        let FetchOutcome {
417            url,
418            depth,
419            result,
420            fetched_at,
421        } = outcome;
422        let page = match result {
423            Ok(p) => p,
424            Err(err) => {
425                on_page(error_result(&url, depth, err, fetched_at));
426                completed += 1;
427                continue;
428            }
429        };
430
431        let budget_used = completed + in_flight.len() + 1;
432        let mut ctx = CrawlContext {
433            frontier: &mut frontier,
434            robots: &robots,
435            opts: &opts,
436        };
437        if let Some(r) = process_ok_fetch(&mut ctx, &url, depth, &page, budget_used, fetched_at) {
438            on_page(r);
439            completed += 1;
440        }
441    }
442}
443
444fn spawn_fetch(
445    in_flight: &mut JoinSet<FetchOutcome>,
446    fetcher: &(impl PageFetcher + Clone),
447    opts: &CrawlPlan,
448    url: Url,
449    depth: usize,
450) {
451    let url_str = url.to_string();
452    let timeout = opts.timeout_secs;
453    let settle = opts.settle_ms;
454    let user_agent = opts.user_agent.clone();
455    let cookies = opts.cookies.clone();
456    let f = fetcher.clone();
457    in_flight.spawn_blocking(move || {
458        let result = f
459            .fetch_page(bridge::FetchOptions {
460                url: &url_str,
461                timeout_secs: timeout,
462                settle_ms: settle,
463                mode: bridge::FetchMode::Content { include_a11y: false },
464                user_agent: user_agent.as_deref(),
465                cookies: &cookies,
466            })
467            .map_err(|e| crate::error::Error::engine(e, Some(url_str.clone())));
468        FetchOutcome {
469            url,
470            depth,
471            result,
472            fetched_at: SystemTime::now(),
473        }
474    });
475}
476
/// Stable crawl state passed to `process_ok_fetch`.
///
/// Bundles the borrows that stay the same for every page, so the per-page arguments
/// (URL, depth, fetched page, budget) can be passed separately.
struct CrawlContext<'a> {
    /// Queue and dedup state; mutated when links are enqueued or content is hashed.
    frontier: &'a mut Frontier,
    /// robots.txt policy consulted for each candidate link.
    robots: &'a RobotsPolicy,
    /// The crawl plan (limits, scope filters, extraction settings).
    opts: &'a CrawlPlan,
}
483
484/// Build a `CrawlPageResult` and enqueue discovered links.
485fn process_ok_fetch(
486    ctx: &mut CrawlContext<'_>,
487    url: &Url,
488    depth: usize,
489    page: &bridge::ServoPage,
490    budget_used: usize,
491    fetched_at: SystemTime,
492) -> Option<CrawlPageResult> {
493    let html = if page.html.len() > MAX_HTML_BYTES {
494        &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
495    } else {
496        &page.html
497    };
498
499    let input = crate::extract::ExtractInput::new(html, url.as_str())
500        .with_layout_json(page.layout_json.as_deref())
501        .with_inner_text(page.inner_text.as_deref())
502        .with_selector(ctx.opts.selector.as_deref());
503
504    let content = if ctx.opts.json {
505        crate::extract::extract_json(&input).ok()
506    } else {
507        crate::extract::extract_text(&input).ok()
508    };
509
510    if content.as_ref().is_some_and(|c| ctx.frontier.is_duplicate_content(c)) {
511        return None;
512    }
513
514    let links = extract_links_from_html(html, url);
515    let links_found = links.len();
516
517    if depth < ctx.opts.max_depth {
518        for link in &links {
519            if budget_used + ctx.frontier.pending() >= ctx.opts.limit {
520                break;
521            }
522            if !is_same_site(&ctx.opts.seed, link)
523                || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
524                || !ctx.robots.is_allowed(link)
525                || !matches_scope(link, ctx.opts.include.as_ref(), ctx.opts.exclude.as_ref())
526            {
527                continue;
528            }
529            ctx.frontier.try_enqueue(link.clone(), depth + 1);
530        }
531    }
532
533    let title = {
534        let doc = dom_query::Document::from(html);
535        let t = doc.select("title").text().to_string();
536        (!t.is_empty()).then_some(t)
537    };
538
539    Some(CrawlPageResult {
540        url: url.to_string(),
541        depth,
542        status: CrawlStatus::Ok,
543        title,
544        content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
545        error: None,
546        links_found,
547        fetched_at,
548    })
549}
550
/// Fetch result crossing the `JoinSet` boundary.
///
/// Produced inside the blocking fetch task and consumed by the crawl loop in `run`.
struct FetchOutcome {
    /// URL that was fetched.
    url: Url,
    /// Link depth of that URL from the seed.
    depth: usize,
    /// The fetched page, or the engine error tagged with the page URL.
    result: Result<bridge::ServoPage, crate::error::Error>,
    /// Wall-clock time taken right after the fetch returned.
    fetched_at: SystemTime,
}
558
559fn error_result(url: &Url, depth: usize, error: crate::error::Error, fetched_at: SystemTime) -> CrawlPageResult {
560    CrawlPageResult {
561        url: url.to_string(),
562        depth,
563        status: CrawlStatus::Error,
564        title: None,
565        content: None,
566        error: Some(error),
567        links_found: 0,
568        fetched_at,
569    }
570}
571
// Unit tests: option builders, the crawl loop driven by an in-memory fetcher,
// and the frontier / link-extraction helpers.
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;

    use super::*;

    /// `CrawlOptions::new` applies the documented defaults.
    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
        assert_eq!(opts.concurrency, 1);
        assert_eq!(opts.delay, Some(Duration::from_millis(500)));
    }

    /// Chained setters each store their value.
    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
        assert_eq!(opts.concurrency, 4);
        assert_eq!(opts.delay, None);
    }

    /// A concurrency of 0 is clamped up to 1.
    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let opts = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(opts.concurrency, 1);
    }

    /// A custom delay replaces the 500ms default.
    #[test]
    fn crawl_options_delay_custom_value() {
        let opts = CrawlOptions::new("https://example.com").delay(Some(Duration::from_secs(2)));
        assert_eq!(opts.delay, Some(Duration::from_secs(2)));
    }

    /// CR and LF in a user agent are replaced by spaces.
    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler  /2.0"));
    }

    /// In-memory fetcher: maps a URL string to the HTML served for it.
    #[derive(Clone)]
    struct MockFetcher(Arc<HashMap<String, String>>);

    impl MockFetcher {
        /// Build a fetcher from `(url, html)` pairs.
        fn new(pages: &[(&str, &str)]) -> Self {
            Self(Arc::new(
                pages.iter().map(|(u, h)| (u.to_string(), h.to_string())).collect(),
            ))
        }
    }

    impl PageFetcher for MockFetcher {
        /// Serve the stored HTML for the requested URL, or a "not found" error.
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> Result<bridge::ServoPage, bridge::EngineError> {
            self.0
                .get(opts.url)
                .map(|html| bridge::ServoPage {
                    html: html.clone(),
                    ..Default::default()
                })
                .ok_or_else(|| bridge::EngineError::Other(anyhow::anyhow!("not found: {}", opts.url)))
        }
    }

    /// HTML page titled "Test" whose body is one anchor per entry in `links`.
    fn page(links: &[&str]) -> String {
        use std::fmt::Write as _;
        let mut anchors = String::new();
        for l in links {
            write!(anchors, r#"<a href="{l}">link</a>"#).unwrap();
        }
        format!("<html><head><title>Test</title></head><body>{anchors}</body></html>")
    }

    /// Leaf page with unique body to avoid content-hash dedup.
    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

    /// Test helper: build `CrawlPlan`, run, assert. `delay=None` keeps tests fast.
    ///
    /// The first entry of `pages` is the seed. `configure` may adjust the default plan
    /// before the crawl; `assert` receives the collected results afterwards.
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        assert: impl FnOnce(&[CrawlPageResult]),
    ) {
        let fetcher = MockFetcher::new(pages);
        let seed = pages[0].0;
        let mut opts = CrawlPlan {
            seed: Url::parse(seed).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
            cookies: Vec::new(),
        };
        configure(&mut opts);
        let mut results = Vec::new();
        run(opts, RobotsPolicy::Unavailable, &fetcher, |r| results.push(r)).await;
        assert(&results);
    }

    /// A seed with no links yields exactly one result.
    #[tokio::test]
    async fn crawl_single_page() {
        check(
            &[("https://example.com/", &page(&[]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 1);
                assert_eq!(r[0].url, "https://example.com/");
            },
        )
        .await;
    }

    /// Links on the seed are followed: the seed plus two linked pages are reported.
    #[tokio::test]
    async fn crawl_follows_links() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                (
                    "https://example.com/a",
                    "<html><head><title>A</title></head><body>page a</body></html>",
                ),
                (
                    "https://example.com/b",
                    "<html><head><title>B</title></head><body>page b</body></html>",
                ),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    /// With `max_depth = 1`, only the seed and its direct link are crawled.
    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a"])),
                ("https://example.com/a", &page(&["/b"])),
                ("https://example.com/b", &page(&["/c"])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.max_depth = 1,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    /// With `limit = 2`, only two pages are reported even though more are linked.
    #[tokio::test]
    async fn crawl_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c"])),
                ("https://example.com/a", &page(&[])),
                ("https://example.com/b", &page(&[])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.limit = 2,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    /// Links to another site are not followed.
    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        check(
            &[
                ("https://example.com/", &page(&["https://other.com/x"])),
                ("https://other.com/x", &page(&[])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 1),
        )
        .await;
    }

    /// Repeated links and a link back to the seed do not cause repeat visits.
    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/a", "/a"])),
                ("https://example.com/a", &page(&["/"])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    /// A page the fetcher cannot serve is reported as an error result.
    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        check(
            &[("https://example.com/", &page(&["/missing"]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 2);
                assert!(matches!(r[1].status, CrawlStatus::Error));
                assert!(r[1].error.is_some());
            },
        )
        .await;
    }

    /// An include glob keeps matching links and drops the rest.
    #[tokio::test]
    async fn crawl_applies_include_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/docs/a", "/blog/b"])),
                ("https://example.com/docs/a", &page(&[])),
                ("https://example.com/blog/b", &page(&[])),
            ],
            |o| o.include = Some(crate::scope::build_globset(&["/docs/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(r.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!r.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    /// An exclude glob drops matching links.
    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/public", "/secret/data"])),
                ("https://example.com/public", &page(&[])),
                ("https://example.com/secret/data", &page(&[])),
            ],
            |o| o.exclude = Some(crate::scope::build_globset(&["/secret/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(!r.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    /// Two URLs serving identical HTML produce a single result beside the seed.
    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", same),
                ("https://example.com/b", same),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    /// With four parallel fetches, every page is still reported (order not checked).
    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| o.concurrency = 4,
            |r| {
                assert_eq!(r.len(), 5);
                let urls: HashSet<&str> = r.iter().map(|p| p.url.as_str()).collect();
                for u in [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ] {
                    assert!(urls.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    /// The page limit holds under parallel fetching.
    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| {
                o.concurrency = 4;
                o.limit = 3;
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    /// With one fetch at a time, results arrive in breadth-first link order.
    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| o.concurrency = 1,
            |r| {
                assert_eq!(r.len(), 3);
                assert_eq!(r[0].url, "https://example.com/");
                assert_eq!(r[1].url, "https://example.com/a");
                assert_eq!(r[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    /// Dispatches are spaced by the configured delay (measured on tokio's paused clock).
    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        // 3 pages at 500ms delay = 2 ticks >= 1s (first dispatch is free).
        let start = tokio::time::Instant::now();
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| {
                o.concurrency = 1;
                o.delay = Some(Duration::from_millis(500));
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    /// The seed counts as visited, and a URL can be enqueued only once.
    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.try_enqueue(seed, 0));
        let other = Url::parse("https://example.com/page").unwrap();
        assert!(f.try_enqueue(other.clone(), 1));
        assert!(!f.try_enqueue(other, 1));
    }

    /// A new frontier holds only the seed at depth 0; popping it leaves it empty.
    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert_eq!(f.pending(), 1);
        let (url, depth) = f.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);
        assert_eq!(f.pending(), 0);
        assert!(f.pop().is_none());
    }

    /// Only http(s) targets survive; `javascript:`, `data:` and `mailto:` are dropped.
    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
            <a href="javascript:void(0)">JS</a>
            <a href="JAVASCRIPT:alert(1)">JS upper</a>
            <a href="data:text/html,<h1>hi</h1>">Data</a>
            <a href="mailto:x@y.com">Mail</a>
            <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = extract_links_from_html(html, &base);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].as_str(), "https://example.com/a");
        assert_eq!(links[1].as_str(), "https://example.com/relative");
    }

    /// `error_result` sets the error status and message and leaves content empty.
    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let r = error_result(&url, 2, crate::error::Error::engine("timeout", None), SystemTime::now());
        assert!(matches!(r.status, CrawlStatus::Error));
        assert!(r.error.as_ref().is_some_and(|e| e.to_string().contains("timeout")));
        assert!(r.content.is_none());
    }

    /// Content is flagged as duplicate on its second sighting, not its first.
    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.is_duplicate_content("unique content"));
        assert!(f.is_duplicate_content("unique content"));
        assert!(!f.is_duplicate_content("different content"));
    }
}
servo_fetch/crawl.rs

servo_fetch/
crawl.rs