1use std::collections::{HashSet, VecDeque};
4use std::hash::{DefaultHasher, Hash, Hasher};
5use std::time::{Duration, SystemTime};
6
7use tokio::task::{JoinSet, spawn_blocking};
8use tokio::time::{MissedTickBehavior, interval};
9use url::Url;
10
11use crate::bridge::{self, PageFetcher};
12use crate::net;
13use crate::robots::RobotsPolicy;
14use crate::scope::{is_same_site, matches_scope, normalize_url};
15
/// Largest prefix of a fetched page's HTML, in bytes (2 MiB), that is handed to
/// content extraction and link discovery; longer documents are cut down to this
/// size on a character boundary before processing.
const MAX_HTML_BYTES: usize = 2 << 20;
17
/// User-facing crawl configuration, assembled through the builder methods on
/// this type and consumed by `crawl()` / `crawl_each()`.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Clone, Debug)]
pub struct CrawlOptions {
    /// Seed URL the crawl starts from (validated when the crawl is planned).
    pub(crate) url: String,
    /// Page budget: fetches stop being scheduled once this many are done or in flight.
    pub(crate) limit: usize,
    /// Links are only followed from pages whose depth is below this value.
    pub(crate) max_depth: usize,
    /// Per-fetch timeout; converted to whole seconds (minimum 1) when planning.
    pub(crate) timeout: Duration,
    /// Settle time forwarded to the page fetcher, in millisecond resolution.
    pub(crate) settle: Duration,
    /// Glob patterns a discovered link must match; empty means no include filter.
    pub(crate) include: Vec<String>,
    /// Glob patterns that exclude discovered links; empty means no exclude filter.
    pub(crate) exclude: Vec<String>,
    /// Optional selector forwarded to content extraction.
    pub(crate) selector: Option<String>,
    /// Selects JSON extraction instead of text extraction.
    pub(crate) json: bool,
    /// Optional user-agent override (stored already sanitized).
    pub(crate) user_agent: Option<String>,
    /// Maximum number of fetches in flight at once; never below 1.
    pub(crate) concurrency: usize,
    /// Minimum interval between fetch starts; `None` disables pacing.
    pub(crate) delay: Option<Duration>,
}
35
36impl CrawlOptions {
37 pub fn new(url: &str) -> Self {
39 Self {
40 url: url.into(),
41 limit: 50,
42 max_depth: 3,
43 timeout: Duration::from_secs(30),
44 settle: Duration::ZERO,
45 include: Vec::new(),
46 exclude: Vec::new(),
47 selector: None,
48 json: false,
49 user_agent: None,
50 concurrency: 1,
51 delay: Some(Duration::from_millis(500)),
52 }
53 }
54
55 pub fn limit(mut self, n: usize) -> Self {
57 self.limit = n;
58 self
59 }
60
61 pub fn max_depth(mut self, n: usize) -> Self {
63 self.max_depth = n;
64 self
65 }
66
67 pub fn timeout(mut self, timeout: Duration) -> Self {
69 self.timeout = timeout;
70 self
71 }
72
73 pub fn settle(mut self, settle: Duration) -> Self {
75 self.settle = settle;
76 self
77 }
78
79 pub fn include(mut self, patterns: &[&str]) -> Self {
81 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
82 self
83 }
84
85 pub fn exclude(mut self, patterns: &[&str]) -> Self {
87 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
88 self
89 }
90
91 pub fn json(mut self, json: bool) -> Self {
93 self.json = json;
94 self
95 }
96
97 pub fn selector(mut self, selector: impl Into<String>) -> Self {
99 self.selector = Some(selector.into());
100 self
101 }
102
103 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
105 self.user_agent = Some(net::sanitize_user_agent(ua.into()));
106 self
107 }
108
109 pub fn concurrency(mut self, n: usize) -> Self {
112 self.concurrency = n.max(1);
113 self
114 }
115
116 pub fn delay(mut self, delay: Option<Duration>) -> Self {
118 self.delay = delay;
119 self
120 }
121}
122
123#[derive(Debug)]
125#[non_exhaustive]
126pub struct CrawlResult {
127 pub url: String,
129 pub depth: usize,
131 pub fetched_at: SystemTime,
133 pub outcome: Result<CrawlPage, crate::error::Error>,
135}
136
/// Content extracted from a successfully fetched page.
#[derive(Clone, Debug)]
pub struct CrawlPage {
    /// Text of the document's `<title>`, when it is non-empty.
    pub title: Option<String>,
    /// Extracted (and sanitized) page content; empty when extraction produced nothing.
    pub content: String,
    /// Number of http(s) links found in the page's HTML.
    pub links_found: usize,
}
147
148impl serde::Serialize for CrawlResult {
149 fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
150 use serde::ser::SerializeMap;
151 let fetched_at = humantime::format_rfc3339_millis(self.fetched_at).to_string();
152 match &self.outcome {
153 Ok(page) => {
154 let mut map = serializer.serialize_map(None)?;
155 map.serialize_entry("type", "page")?;
156 map.serialize_entry("url", &self.url)?;
157 map.serialize_entry("depth", &self.depth)?;
158 map.serialize_entry("fetched_at", &fetched_at)?;
159 if let Some(t) = &page.title {
160 map.serialize_entry("title", t)?;
161 }
162 map.serialize_entry("content", &page.content)?;
163 map.serialize_entry("links_found", &page.links_found)?;
164 map.end()
165 }
166 Err(e) => {
167 let mut map = serializer.serialize_map(None)?;
168 map.serialize_entry("type", "error")?;
169 map.serialize_entry("url", &self.url)?;
170 map.serialize_entry("depth", &self.depth)?;
171 map.serialize_entry("fetched_at", &fetched_at)?;
172 map.serialize_entry("error", &e.to_string())?;
173 map.end()
174 }
175 }
176 }
177}
178
179impl CrawlResult {
180 fn from_internal(r: CrawlPageResult) -> Self {
181 let outcome = match r.status {
182 CrawlStatus::Ok => Ok(CrawlPage {
183 title: r.title,
184 content: r.content.unwrap_or_default(),
185 links_found: r.links_found,
186 }),
187 CrawlStatus::Error => Err(r
188 .error
189 .unwrap_or_else(|| crate::error::Error::engine("unknown crawl error", None))),
190 };
191 Self {
192 url: r.url,
193 depth: r.depth,
194 fetched_at: r.fetched_at,
195 outcome,
196 }
197 }
198}
199
200#[allow(clippy::needless_pass_by_value)]
202pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(CrawlResult)) -> crate::error::Result<()> {
203 net::ensure_crypto_provider();
204 let plan = build_crawl_plan(&opts)?;
205 crate::runtime::block_on(async {
206 let robots = spawn_blocking({
207 let seed = plan.seed.clone();
208 let user_agent = plan.user_agent.clone();
209 let timeout = Duration::from_secs(plan.timeout_secs);
210 move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
211 })
212 .await
213 .unwrap_or(RobotsPolicy::Unreachable);
214 run(plan, robots, &bridge::ServoFetcher, |r| {
215 on_page(CrawlResult::from_internal(r));
216 })
217 .await;
218 })
219 .map_err(|e| crate::error::Error::engine(e, None))
220}
221
222#[allow(clippy::needless_pass_by_value)]
224pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
225 let mut results = Vec::new();
226 crawl_each(opts, |r| results.push(r))?;
227 Ok(results)
228}
229
230fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
231 let seed = net::validate_url(&opts.url)?;
232 let include = if opts.include.is_empty() {
233 None
234 } else {
235 Some(crate::scope::build_globset(&opts.include)?)
236 };
237 let exclude = if opts.exclude.is_empty() {
238 None
239 } else {
240 Some(crate::scope::build_globset(&opts.exclude)?)
241 };
242 Ok(CrawlPlan {
243 seed,
244 limit: opts.limit,
245 max_depth: opts.max_depth,
246 timeout_secs: opts.timeout.as_secs().max(1),
247 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
248 include,
249 exclude,
250 selector: opts.selector.clone(),
251 json: opts.json,
252 user_agent: opts.user_agent.clone(),
253 concurrency: opts.concurrency,
254 delay: opts.delay,
255 })
256}
257
258pub(crate) struct CrawlPlan {
260 pub seed: Url,
261 pub limit: usize,
262 pub max_depth: usize,
263 pub timeout_secs: u64,
264 pub settle_ms: u64,
265 pub include: Option<globset::GlobSet>,
266 pub exclude: Option<globset::GlobSet>,
267 pub selector: Option<String>,
268 pub json: bool,
269 pub user_agent: Option<String>,
270 pub concurrency: usize,
272 pub delay: Option<Duration>,
274}
275
276pub(crate) struct CrawlPageResult {
278 pub url: String,
279 pub depth: usize,
280 pub status: CrawlStatus,
281 pub title: Option<String>,
282 pub content: Option<String>,
283 pub error: Option<crate::error::Error>,
284 pub links_found: usize,
285 pub fetched_at: SystemTime,
286}
287
/// Outcome tag carried by a [`CrawlPageResult`].
pub(crate) enum CrawlStatus {
    /// The page was fetched and processed.
    Ok,
    /// The fetch failed; the record's `error` field holds the cause.
    Error,
}
293
294struct Frontier {
295 queue: VecDeque<(Url, usize)>,
296 visited: HashSet<String>,
297 content_hashes: HashSet<u64>,
298}
299
300impl Frontier {
301 fn new(seed: &Url) -> Self {
302 Self {
303 queue: VecDeque::from([(seed.clone(), 0)]),
304 visited: HashSet::from([normalize_url(seed)]),
305 content_hashes: HashSet::new(),
306 }
307 }
308
309 fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
310 if self.visited.insert(normalize_url(&url)) {
311 self.queue.push_back((url, depth));
312 true
313 } else {
314 false
315 }
316 }
317
318 fn pop(&mut self) -> Option<(Url, usize)> {
319 self.queue.pop_front()
320 }
321
322 fn is_duplicate_content(&mut self, content: &str) -> bool {
323 let mut h = DefaultHasher::new();
324 content.hash(&mut h);
325 !self.content_hashes.insert(h.finish())
326 }
327
328 fn pending(&self) -> usize {
329 self.queue.len()
330 }
331}
332
333fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
334 dom_query::Document::from(html)
335 .select("a[href]")
336 .iter()
337 .filter_map(|el| {
338 let href = el.attr("href")?;
339 let href = href.trim();
340 if href.is_empty() {
341 return None;
342 }
343 let resolved = base.join(href).ok()?;
344 matches!(resolved.scheme(), "http" | "https").then_some(resolved)
345 })
346 .collect()
347}
348
349pub(crate) async fn run(
350 opts: CrawlPlan,
351 robots: RobotsPolicy,
352 fetcher: &(impl PageFetcher + Clone),
353 mut on_page: impl FnMut(CrawlPageResult),
354) {
355 let mut frontier = Frontier::new(&opts.seed);
356 let mut completed: usize = 0;
357 let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();
358
359 let mut ticker = opts.delay.map(|period| {
361 let mut t = interval(period);
362 t.set_missed_tick_behavior(MissedTickBehavior::Delay);
363 t
364 });
365
366 let concurrency = opts.concurrency.max(1);
367
368 loop {
369 while in_flight.len() < concurrency && completed + in_flight.len() < opts.limit {
370 let Some((url, depth)) = frontier.pop() else {
371 break;
372 };
373 if let Some(t) = ticker.as_mut() {
374 t.tick().await;
375 }
376 spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
377 }
378
379 let outcome = match in_flight.join_next().await {
380 None => break,
381 Some(Ok(o)) => o,
382 Some(Err(e)) if e.is_panic() => {
383 tracing::error!(err = %e, "crawl fetch task panicked");
384 continue;
385 }
386 Some(Err(e)) => {
387 tracing::warn!(err = %e, "crawl fetch task cancelled");
388 continue;
389 }
390 };
391
392 let FetchOutcome {
393 url,
394 depth,
395 result,
396 fetched_at,
397 } = outcome;
398 let page = match result {
399 Ok(p) => p,
400 Err(err) => {
401 on_page(error_result(&url, depth, err, fetched_at));
402 completed += 1;
403 continue;
404 }
405 };
406
407 let budget_used = completed + in_flight.len() + 1;
408 let mut ctx = CrawlContext {
409 frontier: &mut frontier,
410 robots: &robots,
411 opts: &opts,
412 };
413 if let Some(r) = process_ok_fetch(&mut ctx, &url, depth, &page, budget_used, fetched_at) {
414 on_page(r);
415 completed += 1;
416 }
417 }
418}
419
420fn spawn_fetch(
421 in_flight: &mut JoinSet<FetchOutcome>,
422 fetcher: &(impl PageFetcher + Clone),
423 opts: &CrawlPlan,
424 url: Url,
425 depth: usize,
426) {
427 let url_str = url.to_string();
428 let timeout = opts.timeout_secs;
429 let settle = opts.settle_ms;
430 let user_agent = opts.user_agent.clone();
431 let f = fetcher.clone();
432 in_flight.spawn_blocking(move || {
433 let result = f
434 .fetch_page(bridge::FetchOptions {
435 url: &url_str,
436 timeout_secs: timeout,
437 settle_ms: settle,
438 mode: bridge::FetchMode::Content { include_a11y: false },
439 user_agent: user_agent.as_deref(),
440 })
441 .map_err(|e| crate::error::Error::engine(e, Some(url_str.clone())));
442 FetchOutcome {
443 url,
444 depth,
445 result,
446 fetched_at: SystemTime::now(),
447 }
448 });
449}
450
451struct CrawlContext<'a> {
453 frontier: &'a mut Frontier,
454 robots: &'a RobotsPolicy,
455 opts: &'a CrawlPlan,
456}
457
458fn process_ok_fetch(
460 ctx: &mut CrawlContext<'_>,
461 url: &Url,
462 depth: usize,
463 page: &bridge::ServoPage,
464 budget_used: usize,
465 fetched_at: SystemTime,
466) -> Option<CrawlPageResult> {
467 let html = if page.html.len() > MAX_HTML_BYTES {
468 &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
469 } else {
470 &page.html
471 };
472
473 let input = crate::extract::ExtractInput::new(html, url.as_str())
474 .with_layout_json(page.layout_json.as_deref())
475 .with_inner_text(page.inner_text.as_deref())
476 .with_selector(ctx.opts.selector.as_deref());
477
478 let content = if ctx.opts.json {
479 crate::extract::extract_json(&input).ok()
480 } else {
481 crate::extract::extract_text(&input).ok()
482 };
483
484 if content.as_ref().is_some_and(|c| ctx.frontier.is_duplicate_content(c)) {
485 return None;
486 }
487
488 let links = extract_links_from_html(html, url);
489 let links_found = links.len();
490
491 if depth < ctx.opts.max_depth {
492 for link in &links {
493 if budget_used + ctx.frontier.pending() >= ctx.opts.limit {
494 break;
495 }
496 if !is_same_site(&ctx.opts.seed, link)
497 || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
498 || !ctx.robots.is_allowed(link)
499 || !matches_scope(link, ctx.opts.include.as_ref(), ctx.opts.exclude.as_ref())
500 {
501 continue;
502 }
503 ctx.frontier.try_enqueue(link.clone(), depth + 1);
504 }
505 }
506
507 let title = {
508 let doc = dom_query::Document::from(html);
509 let t = doc.select("title").text().to_string();
510 (!t.is_empty()).then_some(t)
511 };
512
513 Some(CrawlPageResult {
514 url: url.to_string(),
515 depth,
516 status: CrawlStatus::Ok,
517 title,
518 content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
519 error: None,
520 links_found,
521 fetched_at,
522 })
523}
524
525struct FetchOutcome {
527 url: Url,
528 depth: usize,
529 result: Result<bridge::ServoPage, crate::error::Error>,
530 fetched_at: SystemTime,
531}
532
533fn error_result(url: &Url, depth: usize, error: crate::error::Error, fetched_at: SystemTime) -> CrawlPageResult {
534 CrawlPageResult {
535 url: url.to_string(),
536 depth,
537 status: CrawlStatus::Error,
538 title: None,
539 content: None,
540 error: Some(error),
541 links_found: 0,
542 fetched_at,
543 }
544}
545
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;

    use super::*;

    /// `CrawlOptions::new` applies the expected defaults.
    #[test]
    fn crawl_options_defaults() {
        let defaults = CrawlOptions::new("https://example.com");
        assert_eq!(defaults.url, "https://example.com");
        assert_eq!(defaults.limit, 50);
        assert_eq!(defaults.max_depth, 3);
        assert_eq!(defaults.timeout, Duration::from_secs(30));
        assert!(defaults.include.is_empty());
        assert!(defaults.exclude.is_empty());
        assert_eq!(defaults.concurrency, 1);
        assert_eq!(defaults.delay, Some(Duration::from_millis(500)));
    }

    /// Builder calls can be chained and each one sticks.
    #[test]
    fn crawl_options_chaining() {
        let built = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(built.limit, 100);
        assert_eq!(built.max_depth, 5);
        assert_eq!(built.include, vec!["/docs/**"]);
        assert_eq!(built.exclude, vec!["/docs/archive/**"]);
        assert_eq!(built.concurrency, 4);
        assert_eq!(built.delay, None);
    }

    /// A concurrency of zero is raised to one.
    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let clamped = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(clamped.concurrency, 1);
    }

    /// A custom delay is stored as given.
    #[test]
    fn crawl_options_delay_custom_value() {
        let two_seconds = Some(Duration::from_secs(2));
        let paced = CrawlOptions::new("https://example.com").delay(two_seconds);
        assert_eq!(paced.delay, two_seconds);
    }

    /// CR/LF in a user agent is neutralised by the builder.
    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let with_agent = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(with_agent.user_agent.as_deref(), Some("Crawler /2.0"));
    }

    /// In-memory fetcher: maps a URL to the HTML it should serve.
    #[derive(Clone)]
    struct MockFetcher(Arc<HashMap<String, String>>);

    impl MockFetcher {
        /// Builds a fetcher serving the given `(url, html)` pairs.
        fn new(pages: &[(&str, &str)]) -> Self {
            let mut table = HashMap::with_capacity(pages.len());
            for &(url, html) in pages {
                table.insert(url.to_owned(), html.to_owned());
            }
            Self(Arc::new(table))
        }
    }

    impl PageFetcher for MockFetcher {
        /// Serves the stored HTML for a known URL and fails for anything else.
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> anyhow::Result<bridge::ServoPage> {
            match self.0.get(opts.url) {
                Some(html) => Ok(bridge::ServoPage {
                    html: html.clone(),
                    ..Default::default()
                }),
                None => Err(anyhow::anyhow!("not found: {}", opts.url)),
            }
        }
    }

    /// HTML document titled "Test" whose body is one anchor per entry in `links`.
    fn page(links: &[&str]) -> String {
        let anchors: String = links
            .iter()
            .map(|target| format!(r#"<a href="{target}">link</a>"#))
            .collect();
        format!("<html><head><title>Test</title></head><body>{anchors}</body></html>")
    }

    /// Link-free HTML document whose title and body both depend on `tag`.
    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

    /// Crawls the mock site `pages` (first entry is the seed) under a default
    /// plan adjusted by `configure`, then hands every result to `verify`.
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        verify: impl FnOnce(&[CrawlPageResult]),
    ) {
        let (seed, _) = pages[0];
        let mut plan = CrawlPlan {
            seed: Url::parse(seed).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
        };
        configure(&mut plan);

        let fetcher = MockFetcher::new(pages);
        let mut collected = Vec::new();
        run(plan, RobotsPolicy::Unavailable, &fetcher, |result| collected.push(result)).await;
        verify(&collected);
    }

    /// A site with one link-free page yields exactly that page.
    #[tokio::test]
    async fn crawl_single_page() {
        let home = page(&[]);
        let site = [("https://example.com/", home.as_str())];
        check(
            &site,
            |_| {},
            |results| {
                assert_eq!(results.len(), 1);
                assert_eq!(results[0].url, "https://example.com/");
            },
        )
        .await;
    }

    /// Links on the seed page are fetched too.
    #[tokio::test]
    async fn crawl_follows_links() {
        let home = page(&["/a", "/b"]);
        let site = [
            ("https://example.com/", home.as_str()),
            (
                "https://example.com/a",
                "<html><head><title>A</title></head><body>page a</body></html>",
            ),
            (
                "https://example.com/b",
                "<html><head><title>B</title></head><body>page b</body></html>",
            ),
        ];
        check(&site, |_| {}, |results| assert_eq!(results.len(), 3)).await;
    }

    /// With `max_depth = 1` only the seed and its direct links are visited.
    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        let home = page(&["/a"]);
        let first = page(&["/b"]);
        let second = page(&["/c"]);
        let third = page(&[]);
        let site = [
            ("https://example.com/", home.as_str()),
            ("https://example.com/a", first.as_str()),
            ("https://example.com/b", second.as_str()),
            ("https://example.com/c", third.as_str()),
        ];
        check(&site, |plan| plan.max_depth = 1, |results| assert_eq!(results.len(), 2)).await;
    }

    /// The page budget caps the number of results.
    #[tokio::test]
    async fn crawl_respects_limit() {
        let home = page(&["/a", "/b", "/c"]);
        let leaf = page(&[]);
        let site = [
            ("https://example.com/", home.as_str()),
            ("https://example.com/a", leaf.as_str()),
            ("https://example.com/b", leaf.as_str()),
            ("https://example.com/c", leaf.as_str()),
        ];
        check(&site, |plan| plan.limit = 2, |results| assert_eq!(results.len(), 2)).await;
    }

    /// Links to another site are not followed.
    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        let home = page(&["https://other.com/x"]);
        let foreign = page(&[]);
        let site = [
            ("https://example.com/", home.as_str()),
            ("https://other.com/x", foreign.as_str()),
        ];
        check(&site, |_| {}, |results| assert_eq!(results.len(), 1)).await;
    }

    /// Repeated and back-pointing links do not cause repeat visits.
    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        let home = page(&["/a", "/a", "/a"]);
        let back = page(&["/"]);
        let site = [
            ("https://example.com/", home.as_str()),
            ("https://example.com/a", back.as_str()),
        ];
        check(&site, |_| {}, |results| assert_eq!(results.len(), 2)).await;
    }

    /// A link the fetcher cannot serve is reported as an error result.
    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        let home = page(&["/missing"]);
        let site = [("https://example.com/", home.as_str())];
        check(
            &site,
            |_| {},
            |results| {
                assert_eq!(results.len(), 2);
                let failed = &results[1];
                assert!(matches!(failed.status, CrawlStatus::Error));
                assert!(failed.error.is_some());
            },
        )
        .await;
    }

    /// Only links matching the include glob are followed.
    #[tokio::test]
    async fn crawl_applies_include_glob() {
        let home = page(&["/docs/a", "/blog/b"]);
        let leaf = page(&[]);
        let site = [
            ("https://example.com/", home.as_str()),
            ("https://example.com/docs/a", leaf.as_str()),
            ("https://example.com/blog/b", leaf.as_str()),
        ];
        check(
            &site,
            |plan| plan.include = Some(crate::scope::build_globset(&["/docs/**".into()]).unwrap()),
            |results| {
                assert_eq!(results.len(), 2);
                assert!(results.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!results.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    /// Links matching the exclude glob are skipped.
    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        let home = page(&["/public", "/secret/data"]);
        let leaf = page(&[]);
        let site = [
            ("https://example.com/", home.as_str()),
            ("https://example.com/public", leaf.as_str()),
            ("https://example.com/secret/data", leaf.as_str()),
        ];
        check(
            &site,
            |plan| plan.exclude = Some(crate::scope::build_globset(&["/secret/**".into()]).unwrap()),
            |results| {
                assert_eq!(results.len(), 2);
                assert!(!results.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    /// Two URLs serving identical content produce a single result between them.
    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        let home = page(&["/a", "/b"]);
        let site = [
            ("https://example.com/", home.as_str()),
            ("https://example.com/a", same),
            ("https://example.com/b", same),
        ];
        check(&site, |_| {}, |results| assert_eq!(results.len(), 2)).await;
    }

    /// With four concurrent fetches every page is still visited.
    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        let home = page(&["/a", "/b", "/c", "/d"]);
        let (a, b, c, d) = (
            distinct_page("a"),
            distinct_page("b"),
            distinct_page("c"),
            distinct_page("d"),
        );
        let site = [
            ("https://example.com/", home.as_str()),
            ("https://example.com/a", a.as_str()),
            ("https://example.com/b", b.as_str()),
            ("https://example.com/c", c.as_str()),
            ("https://example.com/d", d.as_str()),
        ];
        check(
            &site,
            |plan| plan.concurrency = 4,
            |results| {
                assert_eq!(results.len(), 5);
                let seen: HashSet<&str> = results.iter().map(|p| p.url.as_str()).collect();
                let expected = [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ];
                for u in expected {
                    assert!(seen.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    /// The page budget also holds when fetches run concurrently.
    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        let home = page(&["/a", "/b", "/c", "/d"]);
        let (a, b, c, d) = (
            distinct_page("a"),
            distinct_page("b"),
            distinct_page("c"),
            distinct_page("d"),
        );
        let site = [
            ("https://example.com/", home.as_str()),
            ("https://example.com/a", a.as_str()),
            ("https://example.com/b", b.as_str()),
            ("https://example.com/c", c.as_str()),
            ("https://example.com/d", d.as_str()),
        ];
        check(
            &site,
            |plan| {
                plan.concurrency = 4;
                plan.limit = 3;
            },
            |results| assert_eq!(results.len(), 3),
        )
        .await;
    }

    /// With a single fetch at a time, results arrive in breadth-first order.
    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        let home = page(&["/a", "/b"]);
        let a = distinct_page("a");
        let b = distinct_page("b");
        let site = [
            ("https://example.com/", home.as_str()),
            ("https://example.com/a", a.as_str()),
            ("https://example.com/b", b.as_str()),
        ];
        check(
            &site,
            |plan| plan.concurrency = 1,
            |results| {
                assert_eq!(results.len(), 3);
                assert_eq!(results[0].url, "https://example.com/");
                assert_eq!(results[1].url, "https://example.com/a");
                assert_eq!(results[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    /// Three pages with a 500 ms delay take at least one second of (paused) time.
    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        let began = tokio::time::Instant::now();
        let home = page(&["/a", "/b"]);
        let a = distinct_page("a");
        let b = distinct_page("b");
        let site = [
            ("https://example.com/", home.as_str()),
            ("https://example.com/a", a.as_str()),
            ("https://example.com/b", b.as_str()),
        ];
        check(
            &site,
            |plan| {
                plan.concurrency = 1;
                plan.delay = Some(Duration::from_millis(500));
            },
            |results| assert_eq!(results.len(), 3),
        )
        .await;
        let elapsed = began.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    /// The seed and any already-queued URL are rejected on re-enqueue.
    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert!(!frontier.try_enqueue(seed, 0));

        let other = Url::parse("https://example.com/page").unwrap();
        assert!(frontier.try_enqueue(other.clone(), 1));
        assert!(!frontier.try_enqueue(other, 1));
    }

    /// A new frontier holds exactly the seed at depth 0 and is empty after one pop.
    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert_eq!(frontier.pending(), 1);

        let (url, depth) = frontier.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);

        assert_eq!(frontier.pending(), 0);
        assert!(frontier.pop().is_none());
    }

    /// Only http(s) targets survive link extraction.
    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
 <a href="javascript:void(0)">JS</a>
 <a href="JAVASCRIPT:alert(1)">JS upper</a>
 <a href="data:text/html,<h1>hi</h1>">Data</a>
 <a href="mailto:x@y.com">Mail</a>
 <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let found = extract_links_from_html(html, &base);
        assert_eq!(found.len(), 2);
        assert_eq!(found[0].as_str(), "https://example.com/a");
        assert_eq!(found[1].as_str(), "https://example.com/relative");
    }

    /// An error record carries the error and no content.
    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let cause = crate::error::Error::engine("timeout", None);
        let record = error_result(&url, 2, cause, SystemTime::now());
        assert!(matches!(record.status, CrawlStatus::Error));
        assert!(record.error.as_ref().is_some_and(|e| e.to_string().contains("timeout")));
        assert!(record.content.is_none());
    }

    /// Content is flagged as duplicate only from its second sighting on.
    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert!(!frontier.is_duplicate_content("unique content"));
        assert!(frontier.is_duplicate_content("unique content"));
        assert!(!frontier.is_duplicate_content("different content"));
    }
}