use std::collections::{HashSet, VecDeque};
use std::hash::{Hash, Hasher};
use std::time::Duration;

use tokio::task::JoinSet;
use tokio::time::{MissedTickBehavior, interval};
use url::Url;

use crate::bridge::{self, PageFetcher};
use crate::net;
use crate::robots::RobotsPolicy;
use crate::scope::{is_same_site, matches_scope, normalize_url};

const MAX_HTML_BYTES: usize = 2 * 1024 * 1024;

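/// Builder-style configuration for a crawl; pass it to [`crawl`] or [`crawl_each`].
///
/// A minimal, illustrative sketch of the builder (marked `ignore` because the
/// doctest environment and crate paths are not assumed here):
///
/// ```ignore
/// let opts = CrawlOptions::new("https://example.com")
///     .limit(20)
///     .max_depth(2)
///     .concurrency(4)
///     .delay(Some(std::time::Duration::from_millis(250)));
/// let results = crawl(opts)?;
/// ```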
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    pub(crate) url: String,
    pub(crate) limit: usize,
    pub(crate) max_depth: usize,
    pub(crate) timeout: Duration,
    pub(crate) settle: Duration,
    pub(crate) include: Vec<String>,
    pub(crate) exclude: Vec<String>,
    pub(crate) selector: Option<String>,
    pub(crate) json: bool,
    pub(crate) user_agent: Option<String>,
    pub(crate) concurrency: usize,
    pub(crate) delay: Option<Duration>,
}

impl CrawlOptions {
    pub fn new(url: &str) -> Self {
        Self {
            url: url.into(),
            limit: 50,
            max_depth: 3,
            timeout: Duration::from_secs(30),
            settle: Duration::ZERO,
            include: Vec::new(),
            exclude: Vec::new(),
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: Some(Duration::from_millis(500)),
        }
    }

    pub fn limit(mut self, n: usize) -> Self {
        self.limit = n;
        self
    }

    pub fn max_depth(mut self, n: usize) -> Self {
        self.max_depth = n;
        self
    }

    pub fn timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }

    pub fn settle(mut self, settle: Duration) -> Self {
        self.settle = settle;
        self
    }

    pub fn include(mut self, patterns: &[&str]) -> Self {
        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
        self
    }

    pub fn exclude(mut self, patterns: &[&str]) -> Self {
        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
        self
    }

    pub fn json(mut self, json: bool) -> Self {
        self.json = json;
        self
    }

    pub fn selector(mut self, selector: impl Into<String>) -> Self {
        self.selector = Some(selector.into());
        self
    }

    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
        self.user_agent = Some(net::sanitize_user_agent(ua.into()));
        self
    }

    pub fn concurrency(mut self, n: usize) -> Self {
        self.concurrency = n.max(1);
        self
    }

    pub fn delay(mut self, delay: Option<Duration>) -> Self {
        self.delay = delay;
        self
    }
}

#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CrawlResult {
    pub url: String,
    pub depth: usize,
    pub outcome: Result<CrawlPage, CrawlError>,
}

#[derive(Debug, Clone)]
pub struct CrawlPage {
    pub title: Option<String>,
    pub content: String,
    pub links_found: usize,
}

#[derive(Debug, Clone)]
pub struct CrawlError {
    pub message: String,
}

impl std::fmt::Display for CrawlError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.message)
    }
}

impl std::error::Error for CrawlError {}

impl serde::Serialize for CrawlResult {
    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        use serde::ser::SerializeMap;
        match &self.outcome {
            Ok(page) => {
                let mut map = serializer.serialize_map(None)?;
                map.serialize_entry("url", &self.url)?;
                map.serialize_entry("depth", &self.depth)?;
                map.serialize_entry("status", "ok")?;
                if let Some(t) = &page.title {
                    map.serialize_entry("title", t)?;
                }
                map.serialize_entry("content", &page.content)?;
                map.serialize_entry("links_found", &page.links_found)?;
                map.end()
            }
            Err(e) => {
                let mut map = serializer.serialize_map(None)?;
                map.serialize_entry("url", &self.url)?;
                map.serialize_entry("depth", &self.depth)?;
                map.serialize_entry("status", "error")?;
                map.serialize_entry("error", &e.message)?;
                map.end()
            }
        }
    }
}

impl CrawlResult {
    fn from_internal(r: &CrawlPageResult) -> Self {
        let outcome = match r.status {
            CrawlStatus::Ok => Ok(CrawlPage {
                title: r.title.clone(),
                content: r.content.clone().unwrap_or_default(),
                links_found: r.links_found,
            }),
            CrawlStatus::Error => Err(CrawlError {
                message: r.error.clone().unwrap_or_default(),
            }),
        };
        Self {
            url: r.url.clone(),
            depth: r.depth,
            outcome,
        }
    }
}

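/// Streaming variant of [`crawl`]: fetches pages starting from `opts.url` and
/// invokes `on_page` with each [`CrawlResult`] as soon as that page finishes.
///
/// An illustrative sketch (marked `ignore`; error handling and crate paths are
/// assumed by the caller):
///
/// ```ignore
/// crawl_each(CrawlOptions::new("https://example.com").limit(10), |r| match &r.outcome {
///     Ok(page) => println!("{} ({} links)", r.url, page.links_found),
///     Err(e) => eprintln!("{} failed: {e}", r.url),
/// })?;
/// ```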
#[allow(clippy::needless_pass_by_value)]
pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
    net::ensure_crypto_provider();
    let plan = build_crawl_plan(&opts)?;
    crate::runtime::block_on(async {
        let robots = tokio::task::spawn_blocking({
            let seed = plan.seed.clone();
            let user_agent = plan.user_agent.clone();
            let timeout = Duration::from_secs(plan.timeout_secs);
            move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
        })
        .await
        .unwrap_or(RobotsPolicy::Unreachable);
        run(plan, robots, &bridge::ServoFetcher, |r| {
            on_page(&CrawlResult::from_internal(r));
        })
        .await
    })
    .map_err(|e| crate::error::Error::Engine(e.to_string()))?;
    Ok(())
}

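/// Convenience wrapper over [`crawl_each`] that collects every [`CrawlResult`]
/// into a `Vec` before returning.
///
/// Illustrative sketch (`ignore`; assumes the items are in scope):
///
/// ```ignore
/// let results = crawl(CrawlOptions::new("https://example.com").limit(5))?;
/// for r in &results {
///     println!("{} at depth {}", r.url, r.depth);
/// }
/// ```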
#[allow(clippy::needless_pass_by_value)]
pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
    let mut results = Vec::new();
    crawl_each(opts, |r| results.push(r.clone()))?;
    Ok(results)
}

fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
    let seed = net::validate_url(&opts.url)?;
    let include = if opts.include.is_empty() {
        None
    } else {
        Some(crate::scope::build_globset(&opts.include)?)
    };
    let exclude = if opts.exclude.is_empty() {
        None
    } else {
        Some(crate::scope::build_globset(&opts.exclude)?)
    };
    Ok(CrawlPlan {
        seed,
        limit: opts.limit,
        max_depth: opts.max_depth,
        timeout_secs: opts.timeout.as_secs().max(1),
        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
        include,
        exclude,
        selector: opts.selector.clone(),
        json: opts.json,
        user_agent: opts.user_agent.clone(),
        concurrency: opts.concurrency,
        delay: opts.delay,
    })
}

pub(crate) struct CrawlPlan {
    pub seed: Url,
    pub limit: usize,
    pub max_depth: usize,
    pub timeout_secs: u64,
    pub settle_ms: u64,
    pub include: Option<globset::GlobSet>,
    pub exclude: Option<globset::GlobSet>,
    pub selector: Option<String>,
    pub json: bool,
    pub user_agent: Option<String>,
    pub concurrency: usize,
    pub delay: Option<Duration>,
}

#[derive(serde::Serialize)]
pub(crate) struct CrawlPageResult {
    pub url: String,
    pub depth: usize,
    pub status: CrawlStatus,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    pub links_found: usize,
}

#[derive(serde::Serialize)]
#[serde(rename_all = "lowercase")]
pub(crate) enum CrawlStatus {
    Ok,
    Error,
}

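/// BFS frontier: `visited` deduplicates by normalized URL before a page is ever
/// queued, while `content_hashes` deduplicates after the fact by hashing extracted
/// page content, so mirror pages reachable under different URLs are reported once.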
struct Frontier {
    queue: VecDeque<(Url, usize)>,
    visited: HashSet<String>,
    content_hashes: HashSet<u64>,
}

impl Frontier {
    fn new(seed: &Url) -> Self {
        Self {
            queue: VecDeque::from([(seed.clone(), 0)]),
            visited: HashSet::from([normalize_url(seed)]),
            content_hashes: HashSet::new(),
        }
    }

    fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
        if self.visited.insert(normalize_url(&url)) {
            self.queue.push_back((url, depth));
            true
        } else {
            false
        }
    }

    fn pop(&mut self) -> Option<(Url, usize)> {
        self.queue.pop_front()
    }

    fn is_duplicate_content(&mut self, content: &str) -> bool {
        let mut h = std::hash::DefaultHasher::new();
        content.hash(&mut h);
        !self.content_hashes.insert(h.finish())
    }

    fn pending(&self) -> usize {
        self.queue.len()
    }
}

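// Resolve every `<a href>` against the page URL and keep only http/https targets,
// which drops javascript:, data:, mailto: and other non-navigable schemes.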
fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
    dom_query::Document::from(html)
        .select("a[href]")
        .iter()
        .filter_map(|el| {
            let href = el.attr("href")?;
            let href = href.trim();
            if href.is_empty() {
                return None;
            }
            let resolved = base.join(href).ok()?;
            matches!(resolved.scheme(), "http" | "https").then_some(resolved)
        })
        .collect()
}

pub(crate) async fn run(
    opts: CrawlPlan,
    robots: RobotsPolicy,
    fetcher: &(impl PageFetcher + Clone),
    mut on_page: impl FnMut(&CrawlPageResult),
) -> Vec<CrawlPageResult> {
    let mut frontier = Frontier::new(&opts.seed);
    let mut results = Vec::new();
    let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();

    let mut ticker = opts.delay.map(|period| {
        let mut t = interval(period);
        t.set_missed_tick_behavior(MissedTickBehavior::Delay);
        t
    });

    let concurrency = opts.concurrency.max(1);

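    // Scheduling loop: top up the blocking-fetch pool until the concurrency cap or
    // the remaining page budget (completed + in-flight) is reached, then wait for
    // one fetch to finish before refilling. The optional ticker rate-limits spawns.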
    loop {
        while in_flight.len() < concurrency && results.len() + in_flight.len() < opts.limit {
            let Some((url, depth)) = frontier.pop() else {
                break;
            };
            if let Some(t) = ticker.as_mut() {
                t.tick().await;
            }
            spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
        }

        let outcome = match in_flight.join_next().await {
            None => break,
            Some(Ok(o)) => o,
            Some(Err(e)) if e.is_panic() => {
                tracing::error!(err = %e, "crawl fetch task panicked");
                continue;
            }
            Some(Err(e)) => {
                tracing::warn!(err = %e, "crawl fetch task cancelled");
                continue;
            }
        };

        let FetchOutcome { url, depth, result } = outcome;
        let page = match result {
            Ok(p) => p,
            Err(msg) => {
                let r = error_result(&url, depth, msg);
                on_page(&r);
                results.push(r);
                continue;
            }
        };

        let budget_used = results.len() + in_flight.len() + 1;
        if let Some(r) = process_ok_fetch(&url, depth, &page, &mut frontier, &robots, &opts, budget_used) {
            on_page(&r);
            results.push(r);
        }
    }

    results
}

fn spawn_fetch(
    in_flight: &mut JoinSet<FetchOutcome>,
    fetcher: &(impl PageFetcher + Clone),
    opts: &CrawlPlan,
    url: Url,
    depth: usize,
) {
    let url_str = url.to_string();
    let timeout = opts.timeout_secs;
    let settle = opts.settle_ms;
    let user_agent = opts.user_agent.clone();
    let f = fetcher.clone();
    in_flight.spawn_blocking(move || FetchOutcome {
        url,
        depth,
        result: f
            .fetch_page(bridge::FetchOptions {
                url: &url_str,
                timeout_secs: timeout,
                settle_ms: settle,
                mode: bridge::FetchMode::Content { include_a11y: false },
                user_agent: user_agent.as_deref(),
            })
            .map_err(|e| format!("{e:#}")),
    });
}

fn process_ok_fetch(
    url: &Url,
    depth: usize,
    page: &bridge::ServoPage,
    frontier: &mut Frontier,
    robots: &RobotsPolicy,
    opts: &CrawlPlan,
    budget_used: usize,
) -> Option<CrawlPageResult> {
    let html = if page.html.len() > MAX_HTML_BYTES {
        &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
    } else {
        &page.html
    };

    let input = crate::extract::ExtractInput::new(html, url.as_str())
        .with_layout_json(page.layout_json.as_deref())
        .with_inner_text(page.inner_text.as_deref())
        .with_selector(opts.selector.as_deref());

    let content = if opts.json {
        crate::extract::extract_json(&input).ok()
    } else {
        crate::extract::extract_text(&input).ok()
    };

    if content.as_ref().is_some_and(|c| frontier.is_duplicate_content(c)) {
        return None;
    }

    let links = extract_links_from_html(html, url);
    let links_found = links.len();

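    // Enqueue children only while the overall page budget allows it; `budget_used`
    // already counts completed results, in-flight fetches, and this page.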
    if depth < opts.max_depth {
        for link in &links {
            if budget_used + frontier.pending() >= opts.limit {
                break;
            }
            if !is_same_site(&opts.seed, link)
                || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
                || !robots.is_allowed(link)
                || !matches_scope(link, opts.include.as_ref(), opts.exclude.as_ref())
            {
                continue;
            }
            frontier.try_enqueue(link.clone(), depth + 1);
        }
    }

    let title = {
        let doc = dom_query::Document::from(html);
        let t = doc.select("title").text().to_string();
        (!t.is_empty()).then_some(t)
    };

    Some(CrawlPageResult {
        url: url.to_string(),
        depth,
        status: CrawlStatus::Ok,
        title,
        content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
        error: None,
        links_found,
    })
}

struct FetchOutcome {
    url: Url,
    depth: usize,
    result: Result<bridge::ServoPage, String>,
}

fn error_result(url: &Url, depth: usize, error: String) -> CrawlPageResult {
    CrawlPageResult {
        url: url.to_string(),
        depth,
        status: CrawlStatus::Error,
        title: None,
        content: None,
        error: Some(error),
        links_found: 0,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;
    use std::sync::Arc;

    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
        assert_eq!(opts.concurrency, 1);
        assert_eq!(opts.delay, Some(Duration::from_millis(500)));
    }

    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
        assert_eq!(opts.concurrency, 4);
        assert_eq!(opts.delay, None);
    }

    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let opts = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(opts.concurrency, 1);
    }

    #[test]
    fn crawl_options_delay_custom_value() {
        let opts = CrawlOptions::new("https://example.com").delay(Some(Duration::from_secs(2)));
        assert_eq!(opts.delay, Some(Duration::from_secs(2)));
    }

    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler /2.0"));
    }

    #[derive(Clone)]
    struct MockFetcher(Arc<HashMap<String, String>>);

    impl MockFetcher {
        fn new(pages: &[(&str, &str)]) -> Self {
            Self(Arc::new(
                pages.iter().map(|(u, h)| (u.to_string(), h.to_string())).collect(),
            ))
        }
    }

    impl PageFetcher for MockFetcher {
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> anyhow::Result<bridge::ServoPage> {
            self.0
                .get(opts.url)
                .map(|html| bridge::ServoPage {
                    html: html.clone(),
                    ..Default::default()
                })
                .ok_or_else(|| anyhow::anyhow!("not found: {}", opts.url))
        }
    }

    fn page(links: &[&str]) -> String {
        use std::fmt::Write;
        let mut anchors = String::new();
        for l in links {
            write!(anchors, r#"<a href="{l}">link</a>"#).unwrap();
        }
        format!("<html><head><title>Test</title></head><body>{anchors}</body></html>")
    }

    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

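    // Shared test harness: build a MockFetcher over the given (url, html) pairs,
    // let the caller adjust the CrawlPlan, run the crawl against it, and pass the
    // collected results to the assertion closure.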
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        assert: impl FnOnce(&[CrawlPageResult]),
    ) {
        let fetcher = MockFetcher::new(pages);
        let seed = pages[0].0;
        let mut opts = CrawlPlan {
            seed: Url::parse(seed).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
        };
        configure(&mut opts);
        let results = run(opts, RobotsPolicy::Unavailable, &fetcher, |_| {}).await;
        assert(&results);
    }

    #[tokio::test]
    async fn crawl_single_page() {
        check(
            &[("https://example.com/", &page(&[]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 1);
                assert_eq!(r[0].url, "https://example.com/");
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_follows_links() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                (
                    "https://example.com/a",
                    "<html><head><title>A</title></head><body>page a</body></html>",
                ),
                (
                    "https://example.com/b",
                    "<html><head><title>B</title></head><body>page b</body></html>",
                ),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a"])),
                ("https://example.com/a", &page(&["/b"])),
                ("https://example.com/b", &page(&["/c"])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.max_depth = 1,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c"])),
                ("https://example.com/a", &page(&[])),
                ("https://example.com/b", &page(&[])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.limit = 2,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        check(
            &[
                ("https://example.com/", &page(&["https://other.com/x"])),
                ("https://other.com/x", &page(&[])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 1),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/a", "/a"])),
                ("https://example.com/a", &page(&["/"])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        check(
            &[("https://example.com/", &page(&["/missing"]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 2);
                assert!(matches!(r[1].status, CrawlStatus::Error));
                assert!(r[1].error.is_some());
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_applies_include_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/docs/a", "/blog/b"])),
                ("https://example.com/docs/a", &page(&[])),
                ("https://example.com/blog/b", &page(&[])),
            ],
            |o| o.include = Some(crate::scope::build_globset(&["/docs/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(r.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!r.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/public", "/secret/data"])),
                ("https://example.com/public", &page(&[])),
                ("https://example.com/secret/data", &page(&[])),
            ],
            |o| o.exclude = Some(crate::scope::build_globset(&["/secret/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(!r.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", same),
                ("https://example.com/b", same),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| o.concurrency = 4,
            |r| {
                assert_eq!(r.len(), 5);
                let urls: HashSet<&str> = r.iter().map(|p| p.url.as_str()).collect();
                for u in [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ] {
                    assert!(urls.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| {
                o.concurrency = 4;
                o.limit = 3;
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| o.concurrency = 1,
            |r| {
                assert_eq!(r.len(), 3);
                assert_eq!(r[0].url, "https://example.com/");
                assert_eq!(r[1].url, "https://example.com/a");
                assert_eq!(r[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        let start = tokio::time::Instant::now();
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| {
                o.concurrency = 1;
                o.delay = Some(Duration::from_millis(500));
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.try_enqueue(seed, 0));
        let other = Url::parse("https://example.com/page").unwrap();
        assert!(f.try_enqueue(other.clone(), 1));
        assert!(!f.try_enqueue(other, 1));
    }

    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert_eq!(f.pending(), 1);
        let (url, depth) = f.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);
        assert_eq!(f.pending(), 0);
        assert!(f.pop().is_none());
    }

    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
            <a href="javascript:void(0)">JS</a>
            <a href="JAVASCRIPT:alert(1)">JS upper</a>
            <a href="data:text/html,<h1>hi</h1>">Data</a>
            <a href="mailto:x@y.com">Mail</a>
            <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = extract_links_from_html(html, &base);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].as_str(), "https://example.com/a");
        assert_eq!(links[1].as_str(), "https://example.com/relative");
    }

    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let r = error_result(&url, 2, "timeout".into());
        assert!(matches!(r.status, CrawlStatus::Error));
        assert_eq!(r.error.as_deref(), Some("timeout"));
        assert!(r.content.is_none());
    }

    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.is_duplicate_content("unique content"));
        assert!(f.is_duplicate_content("unique content"));
        assert!(!f.is_duplicate_content("different content"));
    }
}
958}