1use std::collections::{HashSet, VecDeque};
4use std::fmt;
5use std::hash::{DefaultHasher, Hash, Hasher};
6use std::time::{Duration, SystemTime};
7
8use tokio::task::{JoinSet, spawn_blocking};
9use tokio::time::{MissedTickBehavior, interval};
10use url::Url;
11
12use crate::bridge::{self, PageFetcher};
13use crate::net;
14use crate::robots::RobotsPolicy;
15use crate::scope::{is_same_site, matches_scope, normalize_url};
16
/// Maximum number of bytes of a fetched page's HTML that is handed to the
/// extractors and link parser (2 MiB); longer documents are cut at this cap.
const MAX_HTML_BYTES: usize = 2 << 20;
18
/// User-facing crawl configuration.
///
/// Built with [`CrawlOptions::new`] plus the chained setters, then handed to
/// `crawl()` or `crawl_each()`.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    /// Seed URL exactly as supplied by the caller; validated when the plan is built.
    pub(crate) url: String,
    /// Upper bound on the number of pages fetched (finished plus in flight).
    pub(crate) limit: usize,
    /// Links are only followed from pages whose depth is below this value (seed is depth 0).
    pub(crate) max_depth: usize,
    /// Timeout for the robots lookup and for each page fetch (applied in whole seconds).
    pub(crate) timeout: Duration,
    /// Settle time forwarded to the page fetcher in milliseconds.
    // NOTE(review): presumably a post-load wait before capturing the page — confirm in `bridge`.
    pub(crate) settle: Duration,
    /// Glob patterns a discovered link must match; empty means no include filter.
    pub(crate) include: Vec<String>,
    /// Glob patterns that reject a discovered link; empty means no exclude filter.
    pub(crate) exclude: Vec<String>,
    /// Optional selector forwarded to content extraction.
    pub(crate) selector: Option<String>,
    /// When `true`, content is produced by the JSON extractor instead of the text extractor.
    pub(crate) json: bool,
    /// Optional User-Agent used for the robots lookup and page fetches.
    pub(crate) user_agent: Option<String>,
    /// Maximum number of fetches in flight at once.
    pub(crate) concurrency: usize,
    /// Minimum interval between starting two fetches; `None` disables pacing.
    pub(crate) delay: Option<Duration>,
}
36
37impl CrawlOptions {
38 pub fn new(url: &str) -> Self {
40 Self {
41 url: url.into(),
42 limit: 50,
43 max_depth: 3,
44 timeout: Duration::from_secs(30),
45 settle: Duration::ZERO,
46 include: Vec::new(),
47 exclude: Vec::new(),
48 selector: None,
49 json: false,
50 user_agent: None,
51 concurrency: 1,
52 delay: Some(Duration::from_millis(500)),
53 }
54 }
55
56 pub fn limit(mut self, n: usize) -> Self {
58 self.limit = n;
59 self
60 }
61
62 pub fn max_depth(mut self, n: usize) -> Self {
64 self.max_depth = n;
65 self
66 }
67
68 pub fn timeout(mut self, timeout: Duration) -> Self {
70 self.timeout = timeout;
71 self
72 }
73
74 pub fn settle(mut self, settle: Duration) -> Self {
76 self.settle = settle;
77 self
78 }
79
80 pub fn include(mut self, patterns: &[&str]) -> Self {
82 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
83 self
84 }
85
86 pub fn exclude(mut self, patterns: &[&str]) -> Self {
88 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
89 self
90 }
91
92 pub fn json(mut self, json: bool) -> Self {
94 self.json = json;
95 self
96 }
97
98 pub fn selector(mut self, selector: impl Into<String>) -> Self {
100 self.selector = Some(selector.into());
101 self
102 }
103
104 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
106 self.user_agent = Some(net::sanitize_user_agent(ua.into()));
107 self
108 }
109
110 pub fn concurrency(mut self, n: usize) -> Self {
113 self.concurrency = n.max(1);
114 self
115 }
116
117 pub fn delay(mut self, delay: Option<Duration>) -> Self {
119 self.delay = delay;
120 self
121 }
122}
123
124#[derive(Debug, Clone)]
126#[non_exhaustive]
127pub struct CrawlResult {
128 pub url: String,
130 pub depth: usize,
132 pub fetched_at: SystemTime,
134 pub outcome: Result<CrawlPage, CrawlError>,
136}
137
/// Data extracted from a successfully fetched page.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Text of the page's `<title>`, when it is non-empty.
    pub title: Option<String>,
    /// Extracted page content (text or JSON, depending on the crawl options).
    pub content: String,
    /// Number of http(s) links found in the page's HTML.
    pub links_found: usize,
}
148
/// Error recorded for a URL whose fetch failed.
#[derive(Debug, Clone)]
pub struct CrawlError {
    /// Human-readable description of the failure.
    pub message: String,
}

impl fmt::Display for CrawlError {
    /// Writes the message verbatim; formatter width and padding are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { message } = self;
        f.write_str(message)
    }
}

impl std::error::Error for CrawlError {}
163
164impl serde::Serialize for CrawlResult {
165 fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
166 use serde::ser::SerializeMap;
167 let fetched_at = humantime::format_rfc3339_millis(self.fetched_at).to_string();
168 match &self.outcome {
169 Ok(page) => {
170 let mut map = serializer.serialize_map(None)?;
171 map.serialize_entry("type", "page")?;
172 map.serialize_entry("url", &self.url)?;
173 map.serialize_entry("depth", &self.depth)?;
174 map.serialize_entry("fetched_at", &fetched_at)?;
175 if let Some(t) = &page.title {
176 map.serialize_entry("title", t)?;
177 }
178 map.serialize_entry("content", &page.content)?;
179 map.serialize_entry("links_found", &page.links_found)?;
180 map.end()
181 }
182 Err(e) => {
183 let mut map = serializer.serialize_map(None)?;
184 map.serialize_entry("type", "error")?;
185 map.serialize_entry("url", &self.url)?;
186 map.serialize_entry("depth", &self.depth)?;
187 map.serialize_entry("fetched_at", &fetched_at)?;
188 map.serialize_entry("error", &e.message)?;
189 map.end()
190 }
191 }
192 }
193}
194
195impl CrawlResult {
196 fn from_internal(r: &CrawlPageResult) -> Self {
197 let outcome = match r.status {
198 CrawlStatus::Ok => Ok(CrawlPage {
199 title: r.title.clone(),
200 content: r.content.clone().unwrap_or_default(),
201 links_found: r.links_found,
202 }),
203 CrawlStatus::Error => Err(CrawlError {
204 message: r.error.clone().unwrap_or_default(),
205 }),
206 };
207 Self {
208 url: r.url.clone(),
209 depth: r.depth,
210 fetched_at: r.fetched_at,
211 outcome,
212 }
213 }
214}
215
216#[allow(clippy::needless_pass_by_value)]
218pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
219 net::ensure_crypto_provider();
220 let plan = build_crawl_plan(&opts)?;
221 crate::runtime::block_on(async {
222 let robots = spawn_blocking({
223 let seed = plan.seed.clone();
224 let user_agent = plan.user_agent.clone();
225 let timeout = Duration::from_secs(plan.timeout_secs);
226 move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
227 })
228 .await
229 .unwrap_or(RobotsPolicy::Unreachable);
230 run(plan, robots, &bridge::ServoFetcher, |r| {
231 on_page(&CrawlResult::from_internal(r));
232 })
233 .await
234 })
235 .map_err(|e| crate::error::Error::Engine(e.to_string()))?;
236 Ok(())
237}
238
239#[allow(clippy::needless_pass_by_value)]
241pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
242 let mut results = Vec::new();
243 crawl_each(opts, |r| results.push(r.clone()))?;
244 Ok(results)
245}
246
247fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
248 let seed = net::validate_url(&opts.url)?;
249 let include = if opts.include.is_empty() {
250 None
251 } else {
252 Some(crate::scope::build_globset(&opts.include)?)
253 };
254 let exclude = if opts.exclude.is_empty() {
255 None
256 } else {
257 Some(crate::scope::build_globset(&opts.exclude)?)
258 };
259 Ok(CrawlPlan {
260 seed,
261 limit: opts.limit,
262 max_depth: opts.max_depth,
263 timeout_secs: opts.timeout.as_secs().max(1),
264 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
265 include,
266 exclude,
267 selector: opts.selector.clone(),
268 json: opts.json,
269 user_agent: opts.user_agent.clone(),
270 concurrency: opts.concurrency,
271 delay: opts.delay,
272 })
273}
274
275pub(crate) struct CrawlPlan {
277 pub seed: Url,
278 pub limit: usize,
279 pub max_depth: usize,
280 pub timeout_secs: u64,
281 pub settle_ms: u64,
282 pub include: Option<globset::GlobSet>,
283 pub exclude: Option<globset::GlobSet>,
284 pub selector: Option<String>,
285 pub json: bool,
286 pub user_agent: Option<String>,
287 pub concurrency: usize,
289 pub delay: Option<Duration>,
291}
292
293pub(crate) struct CrawlPageResult {
295 pub url: String,
296 pub depth: usize,
297 pub status: CrawlStatus,
298 pub title: Option<String>,
299 pub content: Option<String>,
300 pub error: Option<String>,
301 pub links_found: usize,
302 pub fetched_at: SystemTime,
303}
304
/// Outcome tag carried by a [`CrawlPageResult`].
pub(crate) enum CrawlStatus {
    /// The page was fetched and processed.
    Ok,
    /// The fetch failed; the record's `error` field holds the message.
    Error,
}
310
311struct Frontier {
312 queue: VecDeque<(Url, usize)>,
313 visited: HashSet<String>,
314 content_hashes: HashSet<u64>,
315}
316
317impl Frontier {
318 fn new(seed: &Url) -> Self {
319 Self {
320 queue: VecDeque::from([(seed.clone(), 0)]),
321 visited: HashSet::from([normalize_url(seed)]),
322 content_hashes: HashSet::new(),
323 }
324 }
325
326 fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
327 if self.visited.insert(normalize_url(&url)) {
328 self.queue.push_back((url, depth));
329 true
330 } else {
331 false
332 }
333 }
334
335 fn pop(&mut self) -> Option<(Url, usize)> {
336 self.queue.pop_front()
337 }
338
339 fn is_duplicate_content(&mut self, content: &str) -> bool {
340 let mut h = DefaultHasher::new();
341 content.hash(&mut h);
342 !self.content_hashes.insert(h.finish())
343 }
344
345 fn pending(&self) -> usize {
346 self.queue.len()
347 }
348}
349
350fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
351 dom_query::Document::from(html)
352 .select("a[href]")
353 .iter()
354 .filter_map(|el| {
355 let href = el.attr("href")?;
356 let href = href.trim();
357 if href.is_empty() {
358 return None;
359 }
360 let resolved = base.join(href).ok()?;
361 matches!(resolved.scheme(), "http" | "https").then_some(resolved)
362 })
363 .collect()
364}
365
366pub(crate) async fn run(
367 opts: CrawlPlan,
368 robots: RobotsPolicy,
369 fetcher: &(impl PageFetcher + Clone),
370 mut on_page: impl FnMut(&CrawlPageResult),
371) -> Vec<CrawlPageResult> {
372 let mut frontier = Frontier::new(&opts.seed);
373 let mut results = Vec::new();
374 let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();
375
376 let mut ticker = opts.delay.map(|period| {
378 let mut t = interval(period);
379 t.set_missed_tick_behavior(MissedTickBehavior::Delay);
380 t
381 });
382
383 let concurrency = opts.concurrency.max(1);
384
385 loop {
386 while in_flight.len() < concurrency && results.len() + in_flight.len() < opts.limit {
387 let Some((url, depth)) = frontier.pop() else {
388 break;
389 };
390 if let Some(t) = ticker.as_mut() {
391 t.tick().await;
392 }
393 spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
394 }
395
396 let outcome = match in_flight.join_next().await {
397 None => break,
398 Some(Ok(o)) => o,
399 Some(Err(e)) if e.is_panic() => {
400 tracing::error!(err = %e, "crawl fetch task panicked");
401 continue;
402 }
403 Some(Err(e)) => {
404 tracing::warn!(err = %e, "crawl fetch task cancelled");
405 continue;
406 }
407 };
408
409 let FetchOutcome {
410 url,
411 depth,
412 result,
413 fetched_at,
414 } = outcome;
415 let page = match result {
416 Ok(p) => p,
417 Err(msg) => {
418 let r = error_result(&url, depth, msg, fetched_at);
419 on_page(&r);
420 results.push(r);
421 continue;
422 }
423 };
424
425 let budget_used = results.len() + in_flight.len() + 1;
426 let mut ctx = CrawlContext {
427 frontier: &mut frontier,
428 robots: &robots,
429 opts: &opts,
430 };
431 if let Some(r) = process_ok_fetch(&mut ctx, &url, depth, &page, budget_used, fetched_at) {
432 on_page(&r);
433 results.push(r);
434 }
435 }
436
437 results
438}
439
440fn spawn_fetch(
441 in_flight: &mut JoinSet<FetchOutcome>,
442 fetcher: &(impl PageFetcher + Clone),
443 opts: &CrawlPlan,
444 url: Url,
445 depth: usize,
446) {
447 let url_str = url.to_string();
448 let timeout = opts.timeout_secs;
449 let settle = opts.settle_ms;
450 let user_agent = opts.user_agent.clone();
451 let f = fetcher.clone();
452 in_flight.spawn_blocking(move || {
453 let result = f
454 .fetch_page(bridge::FetchOptions {
455 url: &url_str,
456 timeout_secs: timeout,
457 settle_ms: settle,
458 mode: bridge::FetchMode::Content { include_a11y: false },
459 user_agent: user_agent.as_deref(),
460 })
461 .map_err(|e| format!("{e:#}"));
462 FetchOutcome {
463 url,
464 depth,
465 result,
466 fetched_at: SystemTime::now(),
467 }
468 });
469}
470
471struct CrawlContext<'a> {
473 frontier: &'a mut Frontier,
474 robots: &'a RobotsPolicy,
475 opts: &'a CrawlPlan,
476}
477
478fn process_ok_fetch(
480 ctx: &mut CrawlContext<'_>,
481 url: &Url,
482 depth: usize,
483 page: &bridge::ServoPage,
484 budget_used: usize,
485 fetched_at: SystemTime,
486) -> Option<CrawlPageResult> {
487 let html = if page.html.len() > MAX_HTML_BYTES {
488 &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
489 } else {
490 &page.html
491 };
492
493 let input = crate::extract::ExtractInput::new(html, url.as_str())
494 .with_layout_json(page.layout_json.as_deref())
495 .with_inner_text(page.inner_text.as_deref())
496 .with_selector(ctx.opts.selector.as_deref());
497
498 let content = if ctx.opts.json {
499 crate::extract::extract_json(&input).ok()
500 } else {
501 crate::extract::extract_text(&input).ok()
502 };
503
504 if content.as_ref().is_some_and(|c| ctx.frontier.is_duplicate_content(c)) {
505 return None;
506 }
507
508 let links = extract_links_from_html(html, url);
509 let links_found = links.len();
510
511 if depth < ctx.opts.max_depth {
512 for link in &links {
513 if budget_used + ctx.frontier.pending() >= ctx.opts.limit {
514 break;
515 }
516 if !is_same_site(&ctx.opts.seed, link)
517 || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
518 || !ctx.robots.is_allowed(link)
519 || !matches_scope(link, ctx.opts.include.as_ref(), ctx.opts.exclude.as_ref())
520 {
521 continue;
522 }
523 ctx.frontier.try_enqueue(link.clone(), depth + 1);
524 }
525 }
526
527 let title = {
528 let doc = dom_query::Document::from(html);
529 let t = doc.select("title").text().to_string();
530 (!t.is_empty()).then_some(t)
531 };
532
533 Some(CrawlPageResult {
534 url: url.to_string(),
535 depth,
536 status: CrawlStatus::Ok,
537 title,
538 content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
539 error: None,
540 links_found,
541 fetched_at,
542 })
543}
544
545struct FetchOutcome {
547 url: Url,
548 depth: usize,
549 result: Result<bridge::ServoPage, String>,
550 fetched_at: SystemTime,
551}
552
553fn error_result(url: &Url, depth: usize, error: String, fetched_at: SystemTime) -> CrawlPageResult {
554 CrawlPageResult {
555 url: url.to_string(),
556 depth,
557 status: CrawlStatus::Error,
558 title: None,
559 content: None,
560 error: Some(error),
561 links_found: 0,
562 fetched_at,
563 }
564}
565
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;

    use super::*;

    /// `CrawlOptions::new` yields the documented defaults.
    #[test]
    fn crawl_options_defaults() {
        let defaults = CrawlOptions::new("https://example.com");
        assert_eq!(defaults.url, "https://example.com");
        assert_eq!(defaults.limit, 50);
        assert_eq!(defaults.max_depth, 3);
        assert_eq!(defaults.timeout, Duration::from_secs(30));
        assert!(defaults.include.is_empty());
        assert!(defaults.exclude.is_empty());
        assert_eq!(defaults.concurrency, 1);
        assert_eq!(defaults.delay, Some(Duration::from_millis(500)));
    }

    /// Chained setters each land in their own field.
    #[test]
    fn crawl_options_chaining() {
        let configured = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(configured.limit, 100);
        assert_eq!(configured.max_depth, 5);
        assert_eq!(configured.include, vec!["/docs/**"]);
        assert_eq!(configured.exclude, vec!["/docs/archive/**"]);
        assert_eq!(configured.concurrency, 4);
        assert_eq!(configured.delay, None);
    }

    /// A concurrency of zero is raised to one.
    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let clamped = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(clamped.concurrency, 1);
    }

    /// A custom delay is stored unchanged.
    #[test]
    fn crawl_options_delay_custom_value() {
        let two_seconds = Some(Duration::from_secs(2));
        let paced = CrawlOptions::new("https://example.com").delay(two_seconds);
        assert_eq!(paced.delay, two_seconds);
    }

    /// CR/LF in a User-Agent is sanitized when the option is set.
    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let with_agent = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(with_agent.user_agent.as_deref(), Some("Crawler /2.0"));
    }

    /// In-memory fetcher: maps a URL to the HTML it should return.
    #[derive(Clone)]
    struct MockFetcher(Arc<HashMap<String, String>>);

    impl MockFetcher {
        /// Builds a fetcher serving the given `(url, html)` pairs.
        fn new(pages: &[(&str, &str)]) -> Self {
            let table: HashMap<String, String> = pages
                .iter()
                .map(|&(url, html)| (url.to_owned(), html.to_owned()))
                .collect();
            Self(Arc::new(table))
        }
    }

    impl PageFetcher for MockFetcher {
        /// Serves the stored HTML, or fails with `not found: <url>` for unknown URLs.
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> anyhow::Result<bridge::ServoPage> {
            match self.0.get(opts.url) {
                Some(html) => Ok(bridge::ServoPage {
                    html: html.clone(),
                    ..Default::default()
                }),
                None => Err(anyhow::anyhow!("not found: {}", opts.url)),
            }
        }
    }

    /// HTML page titled "Test" whose body is one anchor per entry in `links`.
    fn page(links: &[&str]) -> String {
        let anchors: String = links
            .iter()
            .map(|l| format!(r#"<a href="{l}">link</a>"#))
            .collect();
        format!("<html><head><title>Test</title></head><body>{anchors}</body></html>")
    }

    /// Link-free page whose title and body both depend on `tag`, so pages with
    /// different tags never collide in content deduplication.
    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

    /// Runs a crawl over `pages` (the first entry is the seed) with a default
    /// plan adjusted by `configure`, then passes the results to `assert`.
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        assert: impl FnOnce(&[CrawlPageResult]),
    ) {
        let fetcher = MockFetcher::new(pages);
        let (seed_url, _) = pages[0];
        let mut plan = CrawlPlan {
            seed: Url::parse(seed_url).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
        };
        configure(&mut plan);
        let results = run(plan, RobotsPolicy::Unavailable, &fetcher, |_| {}).await;
        assert(&results);
    }

    #[tokio::test]
    async fn crawl_single_page() {
        check(
            &[("https://example.com/", &page(&[]))],
            |_| {},
            |results| {
                assert_eq!(results.len(), 1);
                assert_eq!(results[0].url, "https://example.com/");
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_follows_links() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                (
                    "https://example.com/a",
                    "<html><head><title>A</title></head><body>page a</body></html>",
                ),
                (
                    "https://example.com/b",
                    "<html><head><title>B</title></head><body>page b</body></html>",
                ),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 3),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a"])),
                ("https://example.com/a", &page(&["/b"])),
                ("https://example.com/b", &page(&["/c"])),
                ("https://example.com/c", &page(&[])),
            ],
            |plan| plan.max_depth = 1,
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c"])),
                ("https://example.com/a", &page(&[])),
                ("https://example.com/b", &page(&[])),
                ("https://example.com/c", &page(&[])),
            ],
            |plan| plan.limit = 2,
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        check(
            &[
                ("https://example.com/", &page(&["https://other.com/x"])),
                ("https://other.com/x", &page(&[])),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 1),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/a", "/a"])),
                ("https://example.com/a", &page(&["/"])),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        check(
            &[("https://example.com/", &page(&["/missing"]))],
            |_| {},
            |results| {
                assert_eq!(results.len(), 2);
                let failed = &results[1];
                assert!(matches!(failed.status, CrawlStatus::Error));
                assert!(failed.error.is_some());
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_applies_include_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/docs/a", "/blog/b"])),
                ("https://example.com/docs/a", &page(&[])),
                ("https://example.com/blog/b", &page(&[])),
            ],
            |plan| plan.include = Some(crate::scope::build_globset(&["/docs/**".into()]).unwrap()),
            |results| {
                assert_eq!(results.len(), 2);
                assert!(results.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!results.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/public", "/secret/data"])),
                ("https://example.com/public", &page(&[])),
                ("https://example.com/secret/data", &page(&[])),
            ],
            |plan| plan.exclude = Some(crate::scope::build_globset(&["/secret/**".into()]).unwrap()),
            |results| {
                assert_eq!(results.len(), 2);
                assert!(!results.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", same),
                ("https://example.com/b", same),
            ],
            |_| {},
            |results| assert_eq!(results.len(), 2),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |plan| plan.concurrency = 4,
            |results| {
                assert_eq!(results.len(), 5);
                let urls: HashSet<&str> = results.iter().map(|p| p.url.as_str()).collect();
                for u in [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ] {
                    assert!(urls.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |plan| {
                plan.concurrency = 4;
                plan.limit = 3;
            },
            |results| assert_eq!(results.len(), 3),
        )
        .await;
    }

    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |plan| plan.concurrency = 1,
            |results| {
                assert_eq!(results.len(), 3);
                assert_eq!(results[0].url, "https://example.com/");
                assert_eq!(results[1].url, "https://example.com/a");
                assert_eq!(results[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    /// Runs on tokio's paused clock, so the delay is observed without real waiting.
    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        let start = tokio::time::Instant::now();
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |plan| {
                plan.concurrency = 1;
                plan.delay = Some(Duration::from_millis(500));
            },
            |results| assert_eq!(results.len(), 3),
        )
        .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        // The seed is marked visited at construction.
        assert!(!frontier.try_enqueue(seed, 0));
        let other = Url::parse("https://example.com/page").unwrap();
        assert!(frontier.try_enqueue(other.clone(), 1));
        assert!(!frontier.try_enqueue(other, 1));
    }

    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert_eq!(frontier.pending(), 1);
        let (url, depth) = frontier.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);
        assert_eq!(frontier.pending(), 0);
        assert!(frontier.pop().is_none());
    }

    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
 <a href="javascript:void(0)">JS</a>
 <a href="JAVASCRIPT:alert(1)">JS upper</a>
 <a href="data:text/html,<h1>hi</h1>">Data</a>
 <a href="mailto:x@y.com">Mail</a>
 <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = extract_links_from_html(html, &base);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].as_str(), "https://example.com/a");
        assert_eq!(links[1].as_str(), "https://example.com/relative");
    }

    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let record = error_result(&url, 2, "timeout".into(), SystemTime::now());
        assert!(matches!(record.status, CrawlStatus::Error));
        assert_eq!(record.error.as_deref(), Some("timeout"));
        assert!(record.content.is_none());
    }

    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut frontier = Frontier::new(&seed);
        assert!(!frontier.is_duplicate_content("unique content"));
        assert!(frontier.is_duplicate_content("unique content"));
        assert!(!frontier.is_duplicate_content("different content"));
    }
}