1use std::collections::{HashSet, VecDeque};
4use std::hash::{DefaultHasher, Hash, Hasher};
5use std::time::{Duration, SystemTime};
6
7use tokio::task::{JoinSet, spawn_blocking};
8use tokio::time::{MissedTickBehavior, interval};
9use url::Url;
10
11use crate::bridge::{self, PageFetcher};
12use crate::net;
13use crate::robots::RobotsPolicy;
14use crate::scope::{is_same_site, matches_scope, normalize_url};
15
/// Upper bound, in bytes, on how much of a fetched page's HTML is processed (2 MiB).
///
/// `process_ok_fetch` cuts longer documents down to this size (using
/// `crate::sanitize::floor_char_boundary` to pick the cut point) before
/// extracting content, links and the title.
const MAX_HTML_BYTES: usize = 2 * 1024 * 1024;
17
/// User-facing crawl configuration, assembled with chained setter methods.
///
/// Start from [`CrawlOptions::new`] and hand the result to [`crawl`],
/// [`crawl_each`], or one of their blocking counterparts.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    /// Seed URL; validated with `net::validate_url` when the crawl starts.
    pub(crate) url: String,
    /// Cap on pages handled by the crawl (completed plus in-flight fetches).
    pub(crate) limit: usize,
    /// Links are only followed from pages whose depth is below this value
    /// (the seed is depth 0).
    pub(crate) max_depth: usize,
    /// Fetch timeout; converted to whole seconds (minimum 1) for the crawl plan.
    pub(crate) timeout: Duration,
    /// Settle time forwarded to the page fetcher in milliseconds.
    // NOTE(review): presumably a post-load wait before capture — confirm against the bridge.
    pub(crate) settle: Duration,
    /// Glob patterns compiled with `scope::build_globset`; empty means "no include filter".
    pub(crate) include: Vec<String>,
    /// Glob patterns compiled with `scope::build_globset`; empty means "no exclude filter".
    pub(crate) exclude: Vec<String>,
    /// Optional selector passed to the content extractor.
    pub(crate) selector: Option<String>,
    /// When `true`, content is produced by `extract_json` instead of `extract_text`.
    pub(crate) json: bool,
    /// Optional User-Agent, stored after `net::sanitize_user_agent`.
    pub(crate) user_agent: Option<String>,
    /// Maximum number of fetches in flight at once (the setter clamps to at least 1).
    pub(crate) concurrency: usize,
    /// Pacing interval between fetch starts; `None` disables pacing.
    pub(crate) delay: Option<Duration>,
    /// Cookies forwarded to every page fetch.
    pub(crate) cookies: Vec<crate::cookies::CookieSpec>,
}
36
37impl CrawlOptions {
38 pub fn new(url: &str) -> Self {
40 Self {
41 url: url.into(),
42 limit: 50,
43 max_depth: 3,
44 timeout: Duration::from_secs(30),
45 settle: Duration::ZERO,
46 include: Vec::new(),
47 exclude: Vec::new(),
48 selector: None,
49 json: false,
50 user_agent: None,
51 concurrency: 1,
52 delay: Some(Duration::from_millis(500)),
53 cookies: Vec::new(),
54 }
55 }
56
57 pub fn limit(mut self, n: usize) -> Self {
59 self.limit = n;
60 self
61 }
62
63 pub fn max_depth(mut self, n: usize) -> Self {
65 self.max_depth = n;
66 self
67 }
68
69 pub fn timeout(mut self, timeout: Duration) -> Self {
71 self.timeout = timeout;
72 self
73 }
74
75 pub fn settle(mut self, settle: Duration) -> Self {
77 self.settle = settle;
78 self
79 }
80
81 pub fn include(mut self, patterns: &[&str]) -> Self {
83 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
84 self
85 }
86
87 pub fn exclude(mut self, patterns: &[&str]) -> Self {
89 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
90 self
91 }
92
93 pub fn json(mut self, json: bool) -> Self {
95 self.json = json;
96 self
97 }
98
99 pub fn selector(mut self, selector: impl Into<String>) -> Self {
101 self.selector = Some(selector.into());
102 self
103 }
104
105 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
107 self.user_agent = Some(net::sanitize_user_agent(ua.into()));
108 self
109 }
110
111 pub fn concurrency(mut self, n: usize) -> Self {
114 self.concurrency = n.max(1);
115 self
116 }
117
118 pub fn delay(mut self, delay: Option<Duration>) -> Self {
120 self.delay = delay;
121 self
122 }
123
124 pub fn cookies(mut self, cookies: Vec<crate::cookies::CookieSpec>) -> Self {
126 self.cookies = cookies;
127 self
128 }
129}
130
/// Outcome of visiting a single URL during a crawl.
#[derive(Debug)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL that was fetched.
    pub url: String,
    /// Link distance from the seed URL (the seed itself is depth 0).
    pub depth: usize,
    /// Wall-clock time recorded when the fetch call returned.
    pub fetched_at: SystemTime,
    /// The extracted page on success, or the error that prevented it.
    pub outcome: Result<CrawlPage, crate::error::Error>,
}
144
/// Data extracted from a successfully fetched page.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Text of the page's `<title>`, or `None` when it is empty or absent.
    pub title: Option<String>,
    /// Extracted (and sanitized) content; empty when extraction produced nothing.
    pub content: String,
    /// Number of `http`/`https` links found on the page.
    pub links_found: usize,
}
155
156impl serde::Serialize for CrawlResult {
157 fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
158 use serde::ser::SerializeMap;
159 let fetched_at = humantime::format_rfc3339_millis(self.fetched_at).to_string();
160 match &self.outcome {
161 Ok(page) => {
162 let mut map = serializer.serialize_map(None)?;
163 map.serialize_entry("type", "page")?;
164 map.serialize_entry("url", &self.url)?;
165 map.serialize_entry("depth", &self.depth)?;
166 map.serialize_entry("fetched_at", &fetched_at)?;
167 if let Some(t) = &page.title {
168 map.serialize_entry("title", t)?;
169 }
170 map.serialize_entry("content", &page.content)?;
171 map.serialize_entry("links_found", &page.links_found)?;
172 map.end()
173 }
174 Err(e) => {
175 let mut map = serializer.serialize_map(None)?;
176 map.serialize_entry("type", "error")?;
177 map.serialize_entry("url", &self.url)?;
178 map.serialize_entry("depth", &self.depth)?;
179 map.serialize_entry("fetched_at", &fetched_at)?;
180 map.serialize_entry("error", &e.to_string())?;
181 map.end()
182 }
183 }
184 }
185}
186
187impl CrawlResult {
188 fn from_internal(r: CrawlPageResult) -> Self {
189 let outcome = match r.status {
190 CrawlStatus::Ok => Ok(CrawlPage {
191 title: r.title,
192 content: r.content.unwrap_or_default(),
193 links_found: r.links_found,
194 }),
195 CrawlStatus::Error => Err(r
196 .error
197 .unwrap_or_else(|| crate::error::Error::engine("unknown crawl error", None))),
198 };
199 Self {
200 url: r.url,
201 depth: r.depth,
202 fetched_at: r.fetched_at,
203 outcome,
204 }
205 }
206}
207
208pub fn crawl_each_blocking<F>(opts: &CrawlOptions, on_page: F) -> crate::error::Result<()>
210where
211 F: FnMut(CrawlResult) + Send,
212{
213 crate::runtime::block_on(crawl_each(opts, on_page)).map_err(|e| crate::error::Error::engine(e, None))?
214}
215
216pub async fn crawl_each<F>(opts: &CrawlOptions, mut on_page: F) -> crate::error::Result<()>
218where
219 F: FnMut(CrawlResult) + Send,
220{
221 net::ensure_crypto_provider();
222 let plan = build_crawl_plan(opts)?;
223 let robots = spawn_blocking({
224 let seed = plan.seed.clone();
225 let user_agent = plan.user_agent.clone();
226 let timeout = Duration::from_secs(plan.timeout_secs);
227 move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
228 })
229 .await
230 .unwrap_or(RobotsPolicy::Unreachable);
231 run(plan, robots, &bridge::ServoFetcher, |r| {
232 on_page(CrawlResult::from_internal(r));
233 })
234 .await;
235 Ok(())
236}
237
238pub fn crawl_blocking(opts: &CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
240 let mut results = Vec::new();
241 crawl_each_blocking(opts, |r| results.push(r))?;
242 Ok(results)
243}
244
245pub async fn crawl(opts: &CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
247 let mut results = Vec::new();
248 crawl_each(opts, |r| results.push(r)).await?;
249 Ok(results)
250}
251
252fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
253 let seed = net::validate_url(&opts.url)?;
254 let include = if opts.include.is_empty() {
255 None
256 } else {
257 Some(crate::scope::build_globset(&opts.include)?)
258 };
259 let exclude = if opts.exclude.is_empty() {
260 None
261 } else {
262 Some(crate::scope::build_globset(&opts.exclude)?)
263 };
264 Ok(CrawlPlan {
265 seed,
266 limit: opts.limit,
267 max_depth: opts.max_depth,
268 timeout_secs: opts.timeout.as_secs().max(1),
269 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
270 include,
271 exclude,
272 selector: opts.selector.clone(),
273 json: opts.json,
274 user_agent: opts.user_agent.clone(),
275 concurrency: opts.concurrency,
276 delay: opts.delay,
277 cookies: opts.cookies.clone(),
278 })
279}
280
/// Validated, ready-to-run crawl configuration consumed by `run`.
///
/// Built from [`CrawlOptions`] by `build_crawl_plan`.
pub(crate) struct CrawlPlan {
    /// Parsed seed URL; also the reference for the same-site check.
    pub seed: Url,
    /// Cap on completed plus in-flight pages.
    pub limit: usize,
    /// Links are only followed from pages whose depth is below this value.
    pub max_depth: usize,
    /// Fetch timeout in whole seconds.
    pub timeout_secs: u64,
    /// Settle time in milliseconds, forwarded to the page fetcher.
    pub settle_ms: u64,
    /// Compiled include globs, or `None` when no include filter is set.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude globs, or `None` when no exclude filter is set.
    pub exclude: Option<globset::GlobSet>,
    /// Optional selector passed to the content extractor.
    pub selector: Option<String>,
    /// Selects `extract_json` (`true`) or `extract_text` (`false`).
    pub json: bool,
    /// Optional User-Agent forwarded to fetches.
    pub user_agent: Option<String>,
    /// Maximum number of fetches in flight; `run` treats 0 as 1.
    pub concurrency: usize,
    /// Pacing interval awaited before each fetch is spawned; `None` disables pacing.
    pub delay: Option<Duration>,
    /// Cookies forwarded to every fetch.
    pub cookies: Vec<crate::cookies::CookieSpec>,
}
299
/// Internal per-page record handed to the `on_page` callback of `run`.
///
/// Converted to the public [`CrawlResult`] by `CrawlResult::from_internal`.
pub(crate) struct CrawlPageResult {
    /// URL that was fetched.
    pub url: String,
    /// Link distance from the seed (seed is 0).
    pub depth: usize,
    /// Whether the fetch succeeded or failed.
    pub status: CrawlStatus,
    /// Page title, when one was found (always `None` for errors).
    pub title: Option<String>,
    /// Extracted content; `None` for errors or when extraction failed.
    pub content: Option<String>,
    /// The failure, set for `CrawlStatus::Error` records.
    pub error: Option<crate::error::Error>,
    /// Number of links found on the page (0 for errors).
    pub links_found: usize,
    /// Wall-clock time recorded when the fetch call returned.
    pub fetched_at: SystemTime,
}
311
/// Success/failure tag carried by a [`CrawlPageResult`].
pub(crate) enum CrawlStatus {
    /// The page was fetched and processed.
    Ok,
    /// The fetch failed; details are in `CrawlPageResult::error`.
    Error,
}
317
/// Crawl bookkeeping: the queue of URLs still to fetch plus the sets used to
/// skip already-seen URLs and already-seen content.
struct Frontier {
    /// FIFO queue of `(url, depth)` pairs awaiting a fetch.
    queue: VecDeque<(Url, usize)>,
    /// Normalized forms of every URL that has ever been enqueued.
    visited: HashSet<String>,
    /// Hashes of page content already reported, for duplicate detection.
    content_hashes: HashSet<u64>,
}
323
324impl Frontier {
325 fn new(seed: &Url) -> Self {
326 Self {
327 queue: VecDeque::from([(seed.clone(), 0)]),
328 visited: HashSet::from([normalize_url(seed)]),
329 content_hashes: HashSet::new(),
330 }
331 }
332
333 fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
334 if self.visited.insert(normalize_url(&url)) {
335 self.queue.push_back((url, depth));
336 true
337 } else {
338 false
339 }
340 }
341
342 fn pop(&mut self) -> Option<(Url, usize)> {
343 self.queue.pop_front()
344 }
345
346 fn is_duplicate_content(&mut self, content: &str) -> bool {
347 let mut h = DefaultHasher::new();
348 content.hash(&mut h);
349 !self.content_hashes.insert(h.finish())
350 }
351
352 fn pending(&self) -> usize {
353 self.queue.len()
354 }
355}
356
357fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
358 dom_query::Document::from(html)
359 .select("a[href]")
360 .iter()
361 .filter_map(|el| {
362 let href = el.attr("href")?;
363 let href = href.trim();
364 if href.is_empty() {
365 return None;
366 }
367 let resolved = base.join(href).ok()?;
368 matches!(resolved.scheme(), "http" | "https").then_some(resolved)
369 })
370 .collect()
371}
372
373pub(crate) async fn run(
374 opts: CrawlPlan,
375 robots: RobotsPolicy,
376 fetcher: &(impl PageFetcher + Clone),
377 mut on_page: impl FnMut(CrawlPageResult),
378) {
379 let mut frontier = Frontier::new(&opts.seed);
380 let mut completed: usize = 0;
381 let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();
382
383 let mut ticker = opts.delay.map(|period| {
385 let mut t = interval(period);
386 t.set_missed_tick_behavior(MissedTickBehavior::Delay);
387 t
388 });
389
390 let concurrency = opts.concurrency.max(1);
391
392 loop {
393 while in_flight.len() < concurrency && completed + in_flight.len() < opts.limit {
394 let Some((url, depth)) = frontier.pop() else {
395 break;
396 };
397 if let Some(t) = ticker.as_mut() {
398 t.tick().await;
399 }
400 spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
401 }
402
403 let outcome = match in_flight.join_next().await {
404 None => break,
405 Some(Ok(o)) => o,
406 Some(Err(e)) if e.is_panic() => {
407 tracing::error!(err = %e, "crawl fetch task panicked");
408 continue;
409 }
410 Some(Err(e)) => {
411 tracing::warn!(err = %e, "crawl fetch task cancelled");
412 continue;
413 }
414 };
415
416 let FetchOutcome {
417 url,
418 depth,
419 result,
420 fetched_at,
421 } = outcome;
422 let page = match result {
423 Ok(p) => p,
424 Err(err) => {
425 on_page(error_result(&url, depth, err, fetched_at));
426 completed += 1;
427 continue;
428 }
429 };
430
431 let budget_used = completed + in_flight.len() + 1;
432 let mut ctx = CrawlContext {
433 frontier: &mut frontier,
434 robots: &robots,
435 opts: &opts,
436 };
437 if let Some(r) = process_ok_fetch(&mut ctx, &url, depth, &page, budget_used, fetched_at) {
438 on_page(r);
439 completed += 1;
440 }
441 }
442}
443
444fn spawn_fetch(
445 in_flight: &mut JoinSet<FetchOutcome>,
446 fetcher: &(impl PageFetcher + Clone),
447 opts: &CrawlPlan,
448 url: Url,
449 depth: usize,
450) {
451 let url_str = url.to_string();
452 let timeout = opts.timeout_secs;
453 let settle = opts.settle_ms;
454 let user_agent = opts.user_agent.clone();
455 let cookies = opts.cookies.clone();
456 let f = fetcher.clone();
457 in_flight.spawn_blocking(move || {
458 let result = f
459 .fetch_page(bridge::FetchOptions {
460 url: &url_str,
461 timeout_secs: timeout,
462 settle_ms: settle,
463 mode: bridge::FetchMode::Content { include_a11y: false },
464 user_agent: user_agent.as_deref(),
465 cookies: &cookies,
466 })
467 .map_err(|e| crate::error::Error::engine(e, Some(url_str.clone())));
468 FetchOutcome {
469 url,
470 depth,
471 result,
472 fetched_at: SystemTime::now(),
473 }
474 });
475}
476
/// Borrowed crawl state bundled together for `process_ok_fetch`.
struct CrawlContext<'a> {
    /// Queue and seen-sets; mutated when links are enqueued or content is hashed.
    frontier: &'a mut Frontier,
    /// Robots policy consulted before a link is enqueued.
    robots: &'a RobotsPolicy,
    /// The plan for the crawl in progress.
    opts: &'a CrawlPlan,
}
483
484fn process_ok_fetch(
486 ctx: &mut CrawlContext<'_>,
487 url: &Url,
488 depth: usize,
489 page: &bridge::ServoPage,
490 budget_used: usize,
491 fetched_at: SystemTime,
492) -> Option<CrawlPageResult> {
493 let html = if page.html.len() > MAX_HTML_BYTES {
494 &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
495 } else {
496 &page.html
497 };
498
499 let input = crate::extract::ExtractInput::new(html, url.as_str())
500 .with_layout_json(page.layout_json.as_deref())
501 .with_inner_text(page.inner_text.as_deref())
502 .with_selector(ctx.opts.selector.as_deref());
503
504 let content = if ctx.opts.json {
505 crate::extract::extract_json(&input).ok()
506 } else {
507 crate::extract::extract_text(&input).ok()
508 };
509
510 if content.as_ref().is_some_and(|c| ctx.frontier.is_duplicate_content(c)) {
511 return None;
512 }
513
514 let links = extract_links_from_html(html, url);
515 let links_found = links.len();
516
517 if depth < ctx.opts.max_depth {
518 for link in &links {
519 if budget_used + ctx.frontier.pending() >= ctx.opts.limit {
520 break;
521 }
522 if !is_same_site(&ctx.opts.seed, link)
523 || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
524 || !ctx.robots.is_allowed(link)
525 || !matches_scope(link, ctx.opts.include.as_ref(), ctx.opts.exclude.as_ref())
526 {
527 continue;
528 }
529 ctx.frontier.try_enqueue(link.clone(), depth + 1);
530 }
531 }
532
533 let title = {
534 let doc = dom_query::Document::from(html);
535 let t = doc.select("title").text().to_string();
536 (!t.is_empty()).then_some(t)
537 };
538
539 Some(CrawlPageResult {
540 url: url.to_string(),
541 depth,
542 status: CrawlStatus::Ok,
543 title,
544 content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
545 error: None,
546 links_found,
547 fetched_at,
548 })
549}
550
/// What a fetch task spawned by `spawn_fetch` hands back to the crawl loop.
struct FetchOutcome {
    /// URL that was requested.
    url: Url,
    /// Depth at which the URL was queued.
    depth: usize,
    /// The fetched page, or the engine error wrapping the fetch failure.
    result: Result<bridge::ServoPage, crate::error::Error>,
    /// System time taken right after the fetch call returned.
    fetched_at: SystemTime,
}
558
559fn error_result(url: &Url, depth: usize, error: crate::error::Error, fetched_at: SystemTime) -> CrawlPageResult {
560 CrawlPageResult {
561 url: url.to_string(),
562 depth,
563 status: CrawlStatus::Error,
564 title: None,
565 content: None,
566 error: Some(error),
567 links_found: 0,
568 fetched_at,
569 }
570}
571
// Unit tests for the option builder, the frontier, link extraction, and the
// crawl loop (driven through an in-memory `MockFetcher`).
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;

    use super::*;

    // `CrawlOptions::new` applies the documented defaults.
    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
        assert_eq!(opts.concurrency, 1);
        assert_eq!(opts.delay, Some(Duration::from_millis(500)));
    }

    // Chained setters each store their value.
    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
        assert_eq!(opts.concurrency, 4);
        assert_eq!(opts.delay, None);
    }

    // A concurrency of 0 is raised to 1.
    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let opts = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(opts.concurrency, 1);
    }

    // A custom delay is stored as given.
    #[test]
    fn crawl_options_delay_custom_value() {
        let opts = CrawlOptions::new("https://example.com").delay(Some(Duration::from_secs(2)));
        assert_eq!(opts.delay, Some(Duration::from_secs(2)));
    }

    // CR/LF in a User-Agent is neutralized by the setter.
    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler /2.0"));
    }

    // In-memory fetcher: maps exact URL strings to the HTML to serve.
    #[derive(Clone)]
    struct MockFetcher(Arc<HashMap<String, String>>);

    impl MockFetcher {
        // Builds a fetcher from `(url, html)` pairs.
        fn new(pages: &[(&str, &str)]) -> Self {
            Self(Arc::new(
                pages.iter().map(|(u, h)| (u.to_string(), h.to_string())).collect(),
            ))
        }
    }

    impl PageFetcher for MockFetcher {
        // Serves the stored HTML for a known URL; unknown URLs yield a "not found" error.
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> Result<bridge::ServoPage, bridge::EngineError> {
            self.0
                .get(opts.url)
                .map(|html| bridge::ServoPage {
                    html: html.clone(),
                    ..Default::default()
                })
                .ok_or_else(|| bridge::EngineError::Other(anyhow::anyhow!("not found: {}", opts.url)))
        }
    }

    // HTML page titled "Test" whose body is one anchor per entry in `links`.
    fn page(links: &[&str]) -> String {
        use std::fmt::Write as _;
        let mut anchors = String::new();
        for l in links {
            write!(anchors, r#"<a href="{l}">link</a>"#).unwrap();
        }
        format!("<html><head><title>Test</title></head><body>{anchors}</body></html>")
    }

    // Link-free HTML page whose title and body both embed `tag`, so pages
    // built with different tags have different content.
    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

    // Test harness: serves `pages` from a `MockFetcher`, seeds the crawl at the
    // first page's URL with a default plan (no delay, concurrency 1), lets
    // `configure` adjust the plan, runs the crawl, and passes the collected
    // results to `assert`.
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        assert: impl FnOnce(&[CrawlPageResult]),
    ) {
        let fetcher = MockFetcher::new(pages);
        let seed = pages[0].0;
        let mut opts = CrawlPlan {
            seed: Url::parse(seed).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
            cookies: Vec::new(),
        };
        configure(&mut opts);
        let mut results = Vec::new();
        run(opts, RobotsPolicy::Unavailable, &fetcher, |r| results.push(r)).await;
        assert(&results);
    }

    // A seed with no links yields exactly one result.
    #[tokio::test]
    async fn crawl_single_page() {
        check(
            &[("https://example.com/", &page(&[]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 1);
                assert_eq!(r[0].url, "https://example.com/");
            },
        )
        .await;
    }

    // Links on the seed are fetched too.
    #[tokio::test]
    async fn crawl_follows_links() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                (
                    "https://example.com/a",
                    "<html><head><title>A</title></head><body>page a</body></html>",
                ),
                (
                    "https://example.com/b",
                    "<html><head><title>B</title></head><body>page b</body></html>",
                ),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    // With `max_depth = 1` only the seed and its direct links are visited.
    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a"])),
                ("https://example.com/a", &page(&["/b"])),
                ("https://example.com/b", &page(&["/c"])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.max_depth = 1,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // The page limit caps the number of results.
    #[tokio::test]
    async fn crawl_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c"])),
                ("https://example.com/a", &page(&[])),
                ("https://example.com/b", &page(&[])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.limit = 2,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // Links to another site are not followed.
    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        check(
            &[
                ("https://example.com/", &page(&["https://other.com/x"])),
                ("https://other.com/x", &page(&[])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 1),
        )
        .await;
    }

    // Repeated links and links back to the seed are fetched only once.
    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/a", "/a"])),
                ("https://example.com/a", &page(&["/"])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // A URL the fetcher cannot serve is reported as an error result.
    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        check(
            &[("https://example.com/", &page(&["/missing"]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 2);
                assert!(matches!(r[1].status, CrawlStatus::Error));
                assert!(r[1].error.is_some());
            },
        )
        .await;
    }

    // Only links matching the include glob are followed.
    #[tokio::test]
    async fn crawl_applies_include_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/docs/a", "/blog/b"])),
                ("https://example.com/docs/a", &page(&[])),
                ("https://example.com/blog/b", &page(&[])),
            ],
            |o| o.include = Some(crate::scope::build_globset(&["/docs/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(r.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!r.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    // Links matching the exclude glob are not followed.
    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/public", "/secret/data"])),
                ("https://example.com/public", &page(&[])),
                ("https://example.com/secret/data", &page(&[])),
            ],
            |o| o.exclude = Some(crate::scope::build_globset(&["/secret/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(!r.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    // Two URLs serving identical HTML produce a single result between them.
    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", same),
                ("https://example.com/b", same),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // With concurrency 4 every page is still visited (order is not checked).
    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| o.concurrency = 4,
            |r| {
                assert_eq!(r.len(), 5);
                let urls: HashSet<&str> = r.iter().map(|p| p.url.as_str()).collect();
                for u in [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ] {
                    assert!(urls.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    // The page limit also holds when several fetches run at once.
    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| {
                o.concurrency = 4;
                o.limit = 3;
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    // With concurrency 1 results arrive in breadth-first (queue) order.
    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| o.concurrency = 1,
            |r| {
                assert_eq!(r.len(), 3);
                assert_eq!(r[0].url, "https://example.com/");
                assert_eq!(r[1].url, "https://example.com/a");
                assert_eq!(r[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    // Runs with Tokio time paused: three fetches paced 500 ms apart must span
    // at least 1 s on the Tokio clock (the first tick is immediate).
    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        let start = tokio::time::Instant::now();
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| {
                o.concurrency = 1;
                o.delay = Some(Duration::from_millis(500));
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    // The seed counts as visited; a new URL can be enqueued exactly once.
    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.try_enqueue(seed, 0));
        let other = Url::parse("https://example.com/page").unwrap();
        assert!(f.try_enqueue(other.clone(), 1));
        assert!(!f.try_enqueue(other, 1));
    }

    // A fresh frontier holds just the seed at depth 0; popping it empties the queue.
    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert_eq!(f.pending(), 1);
        let (url, depth) = f.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);
        assert_eq!(f.pending(), 0);
        assert!(f.pop().is_none());
    }

    // Only http(s) targets survive link extraction; relative hrefs are resolved.
    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
            <a href="javascript:void(0)">JS</a>
            <a href="JAVASCRIPT:alert(1)">JS upper</a>
            <a href="data:text/html,<h1>hi</h1>">Data</a>
            <a href="mailto:x@y.com">Mail</a>
            <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = extract_links_from_html(html, &base);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].as_str(), "https://example.com/a");
        assert_eq!(links[1].as_str(), "https://example.com/relative");
    }

    // `error_result` sets the error status and message and leaves content empty.
    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let r = error_result(&url, 2, crate::error::Error::engine("timeout", None), SystemTime::now());
        assert!(matches!(r.status, CrawlStatus::Error));
        assert!(r.error.as_ref().is_some_and(|e| e.to_string().contains("timeout")));
        assert!(r.content.is_none());
    }

    // Content is a duplicate only the second time the same text is seen.
    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.is_duplicate_content("unique content"));
        assert!(f.is_duplicate_content("unique content"));
        assert!(!f.is_duplicate_content("different content"));
    }
}