1use std::collections::{HashSet, VecDeque};
4use std::hash::{Hash, Hasher};
5use std::time::{Duration, SystemTime};
6
7use tokio::task::JoinSet;
8use tokio::time::{MissedTickBehavior, interval};
9use url::Url;
10
11use crate::bridge::{self, PageFetcher};
12use crate::net;
13use crate::robots::RobotsPolicy;
14use crate::scope::{is_same_site, matches_scope, normalize_url};
15
/// Upper bound (2 MiB) on how many bytes of a fetched page's HTML are handed
/// to extraction and link discovery; longer documents are cut at a character
/// boundary at or below this size.
const MAX_HTML_BYTES: usize = 2 << 20;
17
/// Builder-style configuration for a crawl.
///
/// Start from [`CrawlOptions::new`], adjust it with the chained setters, then
/// pass it to [`crawl`] or [`crawl_each`].
#[derive(Clone, Debug)]
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
pub struct CrawlOptions {
    /// Seed URL exactly as supplied; it is validated when the crawl plan is built.
    pub(crate) url: String,
    /// Page budget for the whole crawl.
    pub(crate) limit: usize,
    /// Links are only followed from pages whose depth is below this value
    /// (the seed is depth 0).
    pub(crate) max_depth: usize,
    /// Per-fetch timeout; converted to whole seconds (minimum 1) in the plan.
    pub(crate) timeout: Duration,
    /// Settle time forwarded to the page fetcher, in millisecond resolution.
    pub(crate) settle: Duration,
    /// Glob patterns a discovered link must match to be followed (empty = no filter).
    pub(crate) include: Vec<String>,
    /// Glob patterns that exclude a discovered link (empty = no filter).
    pub(crate) exclude: Vec<String>,
    /// Optional selector forwarded to content extraction.
    pub(crate) selector: Option<String>,
    /// When `true`, content is produced by the JSON extractor instead of the text one.
    pub(crate) json: bool,
    /// Optional `User-Agent` override (already sanitized by the setter).
    pub(crate) user_agent: Option<String>,
    /// Maximum number of fetches in flight at once (at least 1).
    pub(crate) concurrency: usize,
    /// Minimum spacing between the start of consecutive fetches; `None` disables it.
    pub(crate) delay: Option<Duration>,
}
35
36impl CrawlOptions {
37 pub fn new(url: &str) -> Self {
39 Self {
40 url: url.into(),
41 limit: 50,
42 max_depth: 3,
43 timeout: Duration::from_secs(30),
44 settle: Duration::ZERO,
45 include: Vec::new(),
46 exclude: Vec::new(),
47 selector: None,
48 json: false,
49 user_agent: None,
50 concurrency: 1,
51 delay: Some(Duration::from_millis(500)),
52 }
53 }
54
55 pub fn limit(mut self, n: usize) -> Self {
57 self.limit = n;
58 self
59 }
60
61 pub fn max_depth(mut self, n: usize) -> Self {
63 self.max_depth = n;
64 self
65 }
66
67 pub fn timeout(mut self, timeout: Duration) -> Self {
69 self.timeout = timeout;
70 self
71 }
72
73 pub fn settle(mut self, settle: Duration) -> Self {
75 self.settle = settle;
76 self
77 }
78
79 pub fn include(mut self, patterns: &[&str]) -> Self {
81 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
82 self
83 }
84
85 pub fn exclude(mut self, patterns: &[&str]) -> Self {
87 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
88 self
89 }
90
91 pub fn json(mut self, json: bool) -> Self {
93 self.json = json;
94 self
95 }
96
97 pub fn selector(mut self, selector: impl Into<String>) -> Self {
99 self.selector = Some(selector.into());
100 self
101 }
102
103 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
105 self.user_agent = Some(net::sanitize_user_agent(ua.into()));
106 self
107 }
108
109 pub fn concurrency(mut self, n: usize) -> Self {
112 self.concurrency = n.max(1);
113 self
114 }
115
116 pub fn delay(mut self, delay: Option<Duration>) -> Self {
118 self.delay = delay;
119 self
120 }
121}
122
123#[derive(Debug, Clone)]
125#[non_exhaustive]
126pub struct CrawlResult {
127 pub url: String,
129 pub depth: usize,
131 pub fetched_at: SystemTime,
133 pub outcome: Result<CrawlPage, CrawlError>,
135}
136
/// Extracted data for a successfully fetched page.
#[derive(Clone, Debug)]
pub struct CrawlPage {
    /// Text of the page's `<title>`, when it is non-empty.
    pub title: Option<String>,
    /// Extracted content; empty when extraction produced nothing.
    pub content: String,
    /// Number of http(s) links discovered on the page.
    pub links_found: usize,
}
147
/// Failure reported for a single URL of the crawl.
#[derive(Clone, Debug)]
pub struct CrawlError {
    /// Human-readable description of what went wrong.
    pub message: String,
}

impl std::fmt::Display for CrawlError {
    /// Writes the message verbatim; width/padding flags are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.message)
    }
}

impl std::error::Error for CrawlError {}
162
163impl serde::Serialize for CrawlResult {
164 fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
165 use serde::ser::SerializeMap;
166 let fetched_at = humantime::format_rfc3339_millis(self.fetched_at).to_string();
167 match &self.outcome {
168 Ok(page) => {
169 let mut map = serializer.serialize_map(None)?;
170 map.serialize_entry("type", "page")?;
171 map.serialize_entry("url", &self.url)?;
172 map.serialize_entry("depth", &self.depth)?;
173 map.serialize_entry("fetched_at", &fetched_at)?;
174 if let Some(t) = &page.title {
175 map.serialize_entry("title", t)?;
176 }
177 map.serialize_entry("content", &page.content)?;
178 map.serialize_entry("links_found", &page.links_found)?;
179 map.end()
180 }
181 Err(e) => {
182 let mut map = serializer.serialize_map(None)?;
183 map.serialize_entry("type", "error")?;
184 map.serialize_entry("url", &self.url)?;
185 map.serialize_entry("depth", &self.depth)?;
186 map.serialize_entry("fetched_at", &fetched_at)?;
187 map.serialize_entry("error", &e.message)?;
188 map.end()
189 }
190 }
191 }
192}
193
194impl CrawlResult {
195 fn from_internal(r: &CrawlPageResult) -> Self {
196 let outcome = match r.status {
197 CrawlStatus::Ok => Ok(CrawlPage {
198 title: r.title.clone(),
199 content: r.content.clone().unwrap_or_default(),
200 links_found: r.links_found,
201 }),
202 CrawlStatus::Error => Err(CrawlError {
203 message: r.error.clone().unwrap_or_default(),
204 }),
205 };
206 Self {
207 url: r.url.clone(),
208 depth: r.depth,
209 fetched_at: r.fetched_at,
210 outcome,
211 }
212 }
213}
214
215#[allow(clippy::needless_pass_by_value)]
217pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
218 net::ensure_crypto_provider();
219 let plan = build_crawl_plan(&opts)?;
220 crate::runtime::block_on(async {
221 let robots = tokio::task::spawn_blocking({
222 let seed = plan.seed.clone();
223 let user_agent = plan.user_agent.clone();
224 let timeout = Duration::from_secs(plan.timeout_secs);
225 move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
226 })
227 .await
228 .unwrap_or(RobotsPolicy::Unreachable);
229 run(plan, robots, &bridge::ServoFetcher, |r| {
230 on_page(&CrawlResult::from_internal(r));
231 })
232 .await
233 })
234 .map_err(|e| crate::error::Error::Engine(e.to_string()))?;
235 Ok(())
236}
237
238#[allow(clippy::needless_pass_by_value)]
240pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
241 let mut results = Vec::new();
242 crawl_each(opts, |r| results.push(r.clone()))?;
243 Ok(results)
244}
245
246fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
247 let seed = net::validate_url(&opts.url)?;
248 let include = if opts.include.is_empty() {
249 None
250 } else {
251 Some(crate::scope::build_globset(&opts.include)?)
252 };
253 let exclude = if opts.exclude.is_empty() {
254 None
255 } else {
256 Some(crate::scope::build_globset(&opts.exclude)?)
257 };
258 Ok(CrawlPlan {
259 seed,
260 limit: opts.limit,
261 max_depth: opts.max_depth,
262 timeout_secs: opts.timeout.as_secs().max(1),
263 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
264 include,
265 exclude,
266 selector: opts.selector.clone(),
267 json: opts.json,
268 user_agent: opts.user_agent.clone(),
269 concurrency: opts.concurrency,
270 delay: opts.delay,
271 })
272}
273
274pub(crate) struct CrawlPlan {
276 pub seed: Url,
277 pub limit: usize,
278 pub max_depth: usize,
279 pub timeout_secs: u64,
280 pub settle_ms: u64,
281 pub include: Option<globset::GlobSet>,
282 pub exclude: Option<globset::GlobSet>,
283 pub selector: Option<String>,
284 pub json: bool,
285 pub user_agent: Option<String>,
286 pub concurrency: usize,
288 pub delay: Option<Duration>,
290}
291
292pub(crate) struct CrawlPageResult {
294 pub url: String,
295 pub depth: usize,
296 pub status: CrawlStatus,
297 pub title: Option<String>,
298 pub content: Option<String>,
299 pub error: Option<String>,
300 pub links_found: usize,
301 pub fetched_at: SystemTime,
302}
303
/// Outcome tag of a [`CrawlPageResult`].
pub(crate) enum CrawlStatus {
    /// The page was fetched and processed.
    Ok,
    /// The fetch failed; the record carries an error message instead of content.
    Error,
}
309
310struct Frontier {
311 queue: VecDeque<(Url, usize)>,
312 visited: HashSet<String>,
313 content_hashes: HashSet<u64>,
314}
315
316impl Frontier {
317 fn new(seed: &Url) -> Self {
318 Self {
319 queue: VecDeque::from([(seed.clone(), 0)]),
320 visited: HashSet::from([normalize_url(seed)]),
321 content_hashes: HashSet::new(),
322 }
323 }
324
325 fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
326 if self.visited.insert(normalize_url(&url)) {
327 self.queue.push_back((url, depth));
328 true
329 } else {
330 false
331 }
332 }
333
334 fn pop(&mut self) -> Option<(Url, usize)> {
335 self.queue.pop_front()
336 }
337
338 fn is_duplicate_content(&mut self, content: &str) -> bool {
339 let mut h = std::hash::DefaultHasher::new();
340 content.hash(&mut h);
341 !self.content_hashes.insert(h.finish())
342 }
343
344 fn pending(&self) -> usize {
345 self.queue.len()
346 }
347}
348
349fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
350 dom_query::Document::from(html)
351 .select("a[href]")
352 .iter()
353 .filter_map(|el| {
354 let href = el.attr("href")?;
355 let href = href.trim();
356 if href.is_empty() {
357 return None;
358 }
359 let resolved = base.join(href).ok()?;
360 matches!(resolved.scheme(), "http" | "https").then_some(resolved)
361 })
362 .collect()
363}
364
365pub(crate) async fn run(
366 opts: CrawlPlan,
367 robots: RobotsPolicy,
368 fetcher: &(impl PageFetcher + Clone),
369 mut on_page: impl FnMut(&CrawlPageResult),
370) -> Vec<CrawlPageResult> {
371 let mut frontier = Frontier::new(&opts.seed);
372 let mut results = Vec::new();
373 let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();
374
375 let mut ticker = opts.delay.map(|period| {
377 let mut t = interval(period);
378 t.set_missed_tick_behavior(MissedTickBehavior::Delay);
379 t
380 });
381
382 let concurrency = opts.concurrency.max(1);
383
384 loop {
385 while in_flight.len() < concurrency && results.len() + in_flight.len() < opts.limit {
386 let Some((url, depth)) = frontier.pop() else {
387 break;
388 };
389 if let Some(t) = ticker.as_mut() {
390 t.tick().await;
391 }
392 spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
393 }
394
395 let outcome = match in_flight.join_next().await {
396 None => break,
397 Some(Ok(o)) => o,
398 Some(Err(e)) if e.is_panic() => {
399 tracing::error!(err = %e, "crawl fetch task panicked");
400 continue;
401 }
402 Some(Err(e)) => {
403 tracing::warn!(err = %e, "crawl fetch task cancelled");
404 continue;
405 }
406 };
407
408 let FetchOutcome {
409 url,
410 depth,
411 result,
412 fetched_at,
413 } = outcome;
414 let page = match result {
415 Ok(p) => p,
416 Err(msg) => {
417 let r = error_result(&url, depth, msg, fetched_at);
418 on_page(&r);
419 results.push(r);
420 continue;
421 }
422 };
423
424 let budget_used = results.len() + in_flight.len() + 1;
425 let mut ctx = CrawlContext {
426 frontier: &mut frontier,
427 robots: &robots,
428 opts: &opts,
429 };
430 if let Some(r) = process_ok_fetch(&mut ctx, &url, depth, &page, budget_used, fetched_at) {
431 on_page(&r);
432 results.push(r);
433 }
434 }
435
436 results
437}
438
439fn spawn_fetch(
440 in_flight: &mut JoinSet<FetchOutcome>,
441 fetcher: &(impl PageFetcher + Clone),
442 opts: &CrawlPlan,
443 url: Url,
444 depth: usize,
445) {
446 let url_str = url.to_string();
447 let timeout = opts.timeout_secs;
448 let settle = opts.settle_ms;
449 let user_agent = opts.user_agent.clone();
450 let f = fetcher.clone();
451 in_flight.spawn_blocking(move || {
452 let result = f
453 .fetch_page(bridge::FetchOptions {
454 url: &url_str,
455 timeout_secs: timeout,
456 settle_ms: settle,
457 mode: bridge::FetchMode::Content { include_a11y: false },
458 user_agent: user_agent.as_deref(),
459 })
460 .map_err(|e| format!("{e:#}"));
461 FetchOutcome {
462 url,
463 depth,
464 result,
465 fetched_at: SystemTime::now(),
466 }
467 });
468}
469
470struct CrawlContext<'a> {
472 frontier: &'a mut Frontier,
473 robots: &'a RobotsPolicy,
474 opts: &'a CrawlPlan,
475}
476
477fn process_ok_fetch(
479 ctx: &mut CrawlContext<'_>,
480 url: &Url,
481 depth: usize,
482 page: &bridge::ServoPage,
483 budget_used: usize,
484 fetched_at: SystemTime,
485) -> Option<CrawlPageResult> {
486 let html = if page.html.len() > MAX_HTML_BYTES {
487 &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
488 } else {
489 &page.html
490 };
491
492 let input = crate::extract::ExtractInput::new(html, url.as_str())
493 .with_layout_json(page.layout_json.as_deref())
494 .with_inner_text(page.inner_text.as_deref())
495 .with_selector(ctx.opts.selector.as_deref());
496
497 let content = if ctx.opts.json {
498 crate::extract::extract_json(&input).ok()
499 } else {
500 crate::extract::extract_text(&input).ok()
501 };
502
503 if content.as_ref().is_some_and(|c| ctx.frontier.is_duplicate_content(c)) {
504 return None;
505 }
506
507 let links = extract_links_from_html(html, url);
508 let links_found = links.len();
509
510 if depth < ctx.opts.max_depth {
511 for link in &links {
512 if budget_used + ctx.frontier.pending() >= ctx.opts.limit {
513 break;
514 }
515 if !is_same_site(&ctx.opts.seed, link)
516 || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
517 || !ctx.robots.is_allowed(link)
518 || !matches_scope(link, ctx.opts.include.as_ref(), ctx.opts.exclude.as_ref())
519 {
520 continue;
521 }
522 ctx.frontier.try_enqueue(link.clone(), depth + 1);
523 }
524 }
525
526 let title = {
527 let doc = dom_query::Document::from(html);
528 let t = doc.select("title").text().to_string();
529 (!t.is_empty()).then_some(t)
530 };
531
532 Some(CrawlPageResult {
533 url: url.to_string(),
534 depth,
535 status: CrawlStatus::Ok,
536 title,
537 content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
538 error: None,
539 links_found,
540 fetched_at,
541 })
542}
543
544struct FetchOutcome {
546 url: Url,
547 depth: usize,
548 result: Result<bridge::ServoPage, String>,
549 fetched_at: SystemTime,
550}
551
552fn error_result(url: &Url, depth: usize, error: String, fetched_at: SystemTime) -> CrawlPageResult {
553 CrawlPageResult {
554 url: url.to_string(),
555 depth,
556 status: CrawlStatus::Error,
557 title: None,
558 content: None,
559 error: Some(error),
560 links_found: 0,
561 fetched_at,
562 }
563}
564
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;
    use std::sync::Arc;

    // ---- CrawlOptions builder -------------------------------------------

    #[test]
    fn crawl_options_defaults() {
        let options = CrawlOptions::new("https://example.com");
        assert_eq!(options.url, "https://example.com");
        assert_eq!(options.limit, 50);
        assert_eq!(options.max_depth, 3);
        assert_eq!(options.timeout, Duration::from_secs(30));
        assert!(options.include.is_empty());
        assert!(options.exclude.is_empty());
        assert_eq!(options.concurrency, 1);
        assert_eq!(options.delay, Some(Duration::from_millis(500)));
    }

    #[test]
    fn crawl_options_chaining() {
        let options = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(options.limit, 100);
        assert_eq!(options.max_depth, 5);
        assert_eq!(options.include, vec!["/docs/**"]);
        assert_eq!(options.exclude, vec!["/docs/archive/**"]);
        assert_eq!(options.concurrency, 4);
        assert_eq!(options.delay, None);
    }

    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let options = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(options.concurrency, 1);
    }

    #[test]
    fn crawl_options_delay_custom_value() {
        let two_seconds = Duration::from_secs(2);
        let options = CrawlOptions::new("https://example.com").delay(Some(two_seconds));
        assert_eq!(options.delay, Some(two_seconds));
    }

    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let options = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(options.user_agent.as_deref(), Some("Crawler /2.0"));
    }

    // ---- Test doubles and helpers ---------------------------------------

    /// In-memory fetcher: maps a URL to the HTML it should return; unknown
    /// URLs fail with a "not found" error.
    #[derive(Clone)]
    struct MockFetcher(Arc<HashMap<String, String>>);

    impl MockFetcher {
        fn new(pages: &[(&str, &str)]) -> Self {
            let mut by_url = HashMap::with_capacity(pages.len());
            for &(url, html) in pages {
                by_url.insert(url.to_owned(), html.to_owned());
            }
            Self(Arc::new(by_url))
        }
    }

    impl PageFetcher for MockFetcher {
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> anyhow::Result<bridge::ServoPage> {
            match self.0.get(opts.url) {
                Some(html) => Ok(bridge::ServoPage {
                    html: html.clone(),
                    ..Default::default()
                }),
                None => Err(anyhow::anyhow!("not found: {}", opts.url)),
            }
        }
    }

    /// A page titled "Test" whose body is one anchor per entry in `links`.
    fn page(links: &[&str]) -> String {
        let anchors: String = links
            .iter()
            .map(|href| format!(r#"<a href="{href}">link</a>"#))
            .collect();
        format!("<html><head><title>Test</title></head><body>{anchors}</body></html>")
    }

    /// A link-free page whose title and body both contain `tag`, so that its
    /// content differs from pages built with other tags.
    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

    /// Runs a crawl over `pages` (the first entry is the seed) with a default
    /// plan tweaked by `configure`, then hands the results to `verify`.
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        verify: impl FnOnce(&[CrawlPageResult]),
    ) {
        let fetcher = MockFetcher::new(pages);
        let (seed, _) = pages[0];
        let mut plan = CrawlPlan {
            seed: Url::parse(seed).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
        };
        configure(&mut plan);
        let results = run(plan, RobotsPolicy::Unavailable, &fetcher, |_| {}).await;
        verify(&results);
    }

    // ---- Crawl loop behaviour -------------------------------------------

    #[tokio::test]
    async fn crawl_single_page() {
        check(
            &[("https://example.com/", &page(&[]))],
            |_| {},
            |results| {
                assert_eq!(results.len(), 1);
                assert_eq!(results[0].url, "https://example.com/");
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_follows_links() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                (
                    "https://example.com/a",
                    "<html><head><title>A</title></head><body>page a</body></html>",
                ),
                (
                    "https://example.com/b",
                    "<html><head><title>B</title></head><body>page b</body></html>",
                ),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 3),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a"])),
                ("https://example.com/a", &page(&["/b"])),
                ("https://example.com/b", &page(&["/c"])),
                ("https://example.com/c", &page(&[])),
            ],
            |plan| plan.max_depth = 1,
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c"])),
                ("https://example.com/a", &page(&[])),
                ("https://example.com/b", &page(&[])),
                ("https://example.com/c", &page(&[])),
            ],
            |plan| plan.limit = 2,
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        check(
            &[
                ("https://example.com/", &page(&["https://other.com/x"])),
                ("https://other.com/x", &page(&[])),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 1),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/a", "/a"])),
                ("https://example.com/a", &page(&["/"])),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        check(
            &[("https://example.com/", &page(&["/missing"]))],
            |_| {},
            |results| {
                assert_eq!(results.len(), 2);
                assert!(matches!(results[1].status, CrawlStatus::Error));
                assert!(results[1].error.is_some());
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_applies_include_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/docs/a", "/blog/b"])),
                ("https://example.com/docs/a", &page(&[])),
                ("https://example.com/blog/b", &page(&[])),
            ],
            |plan| plan.include = Some(crate::scope::build_globset(&["/docs/**".into()]).unwrap()),
            |results| {
                assert_eq!(results.len(), 2);
                assert!(results.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!results.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/public", "/secret/data"])),
                ("https://example.com/public", &page(&[])),
                ("https://example.com/secret/data", &page(&[])),
            ],
            |plan| plan.exclude = Some(crate::scope::build_globset(&["/secret/**".into()]).unwrap()),
            |results| {
                assert_eq!(results.len(), 2);
                assert!(!results.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", same),
                ("https://example.com/b", same),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |plan| plan.concurrency = 4,
            |results| {
                assert_eq!(results.len(), 5);
                let urls: HashSet<&str> = results.iter().map(|p| p.url.as_str()).collect();
                for u in [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ] {
                    assert!(urls.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |plan| {
                plan.concurrency = 4;
                plan.limit = 3;
            },
            |results| assert_eq!(results.len(), 3),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |plan| plan.concurrency = 1,
            |results| {
                assert_eq!(results.len(), 3);
                assert_eq!(results[0].url, "https://example.com/");
                assert_eq!(results[1].url, "https://example.com/a");
                assert_eq!(results[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        let start = tokio::time::Instant::now();
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |plan| {
                plan.concurrency = 1;
                plan.delay = Some(Duration::from_millis(500));
            },
            |results| assert_eq!(results.len(), 3),
        )
        .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    // ---- Frontier and small helpers -------------------------------------

    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert!(!frontier.try_enqueue(seed, 0));
        let other = Url::parse("https://example.com/page").unwrap();
        assert!(frontier.try_enqueue(other.clone(), 1));
        assert!(!frontier.try_enqueue(other, 1));
    }

    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert_eq!(frontier.pending(), 1);
        let (url, depth) = frontier.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);
        assert_eq!(frontier.pending(), 0);
        assert!(frontier.pop().is_none());
    }

    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
 <a href="javascript:void(0)">JS</a>
 <a href="JAVASCRIPT:alert(1)">JS upper</a>
 <a href="data:text/html,<h1>hi</h1>">Data</a>
 <a href="mailto:x@y.com">Mail</a>
 <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = extract_links_from_html(html, &base);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].as_str(), "https://example.com/a");
        assert_eq!(links[1].as_str(), "https://example.com/relative");
    }

    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let record = error_result(&url, 2, "timeout".into(), SystemTime::now());
        assert!(matches!(record.status, CrawlStatus::Error));
        assert_eq!(record.error.as_deref(), Some("timeout"));
        assert!(record.content.is_none());
    }

    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert!(!frontier.is_duplicate_content("unique content"));
        assert!(frontier.is_duplicate_content("unique content"));
        assert!(!frontier.is_duplicate_content("different content"));
    }
}