1use std::collections::{HashSet, VecDeque};
4use std::hash::{DefaultHasher, Hash, Hasher};
5use std::time::{Duration, SystemTime};
6
7use tokio::task::{JoinSet, spawn_blocking};
8use tokio::time::{MissedTickBehavior, interval};
9use url::Url;
10
11use crate::bridge::{self, PageFetcher};
12use crate::net;
13use crate::robots::RobotsPolicy;
14use crate::scope::{is_same_site, matches_scope, normalize_url};
15
/// Upper bound (2 MiB) on how many bytes of a fetched page's HTML are processed;
/// longer documents are truncated to this size before extraction and link discovery.
const MAX_HTML_BYTES: usize = 2 * 1024 * 1024;
17
/// User-facing configuration for a crawl, built with the chained setters on this type
/// and consumed by `crawl()` / `crawl_each()` and their blocking variants.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    /// Seed URL the crawl starts from (validated when the crawl plan is built).
    pub(crate) url: String,
    /// Maximum number of pages the crawl will fetch and report.
    pub(crate) limit: usize,
    /// Links are only followed from pages whose depth is below this value (seed is depth 0).
    pub(crate) max_depth: usize,
    /// Per-fetch timeout; converted to whole seconds (minimum 1) in the crawl plan.
    pub(crate) timeout: Duration,
    /// Settle time handed to the page fetcher, in millisecond resolution.
    pub(crate) settle: Duration,
    /// Glob patterns discovered links must match; empty means no include filter.
    pub(crate) include: Vec<String>,
    /// Glob patterns that exclude discovered links; empty means no exclude filter.
    pub(crate) exclude: Vec<String>,
    /// Optional selector forwarded to content extraction.
    pub(crate) selector: Option<String>,
    /// When `true`, page content is extracted as JSON instead of text.
    pub(crate) json: bool,
    /// Optional User-Agent used for both the robots fetch and page fetches.
    pub(crate) user_agent: Option<String>,
    /// Maximum number of page fetches in flight at once (never below 1).
    pub(crate) concurrency: usize,
    /// Minimum spacing between the start of consecutive fetches; `None` disables pacing.
    pub(crate) delay: Option<Duration>,
}
35
36impl CrawlOptions {
37 pub fn new(url: &str) -> Self {
39 Self {
40 url: url.into(),
41 limit: 50,
42 max_depth: 3,
43 timeout: Duration::from_secs(30),
44 settle: Duration::ZERO,
45 include: Vec::new(),
46 exclude: Vec::new(),
47 selector: None,
48 json: false,
49 user_agent: None,
50 concurrency: 1,
51 delay: Some(Duration::from_millis(500)),
52 }
53 }
54
55 pub fn limit(mut self, n: usize) -> Self {
57 self.limit = n;
58 self
59 }
60
61 pub fn max_depth(mut self, n: usize) -> Self {
63 self.max_depth = n;
64 self
65 }
66
67 pub fn timeout(mut self, timeout: Duration) -> Self {
69 self.timeout = timeout;
70 self
71 }
72
73 pub fn settle(mut self, settle: Duration) -> Self {
75 self.settle = settle;
76 self
77 }
78
79 pub fn include(mut self, patterns: &[&str]) -> Self {
81 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
82 self
83 }
84
85 pub fn exclude(mut self, patterns: &[&str]) -> Self {
87 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
88 self
89 }
90
91 pub fn json(mut self, json: bool) -> Self {
93 self.json = json;
94 self
95 }
96
97 pub fn selector(mut self, selector: impl Into<String>) -> Self {
99 self.selector = Some(selector.into());
100 self
101 }
102
103 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
105 self.user_agent = Some(net::sanitize_user_agent(ua.into()));
106 self
107 }
108
109 pub fn concurrency(mut self, n: usize) -> Self {
112 self.concurrency = n.max(1);
113 self
114 }
115
116 pub fn delay(mut self, delay: Option<Duration>) -> Self {
118 self.delay = delay;
119 self
120 }
121}
122
/// Public outcome of crawling a single URL: either an extracted page or the error
/// that prevented it from being fetched.
#[derive(Debug)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL that was fetched, as a string.
    pub url: String,
    /// Link depth at which the URL was discovered (the seed is depth 0).
    pub depth: usize,
    /// Wall-clock time recorded once the fetch attempt finished.
    pub fetched_at: SystemTime,
    /// The extracted page on success, or the fetch error.
    pub outcome: Result<CrawlPage, crate::error::Error>,
}
136
/// Extracted data for a successfully fetched page.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Text of the page's `<title>`, or `None` when it is empty or absent.
    pub title: Option<String>,
    /// Extracted (and sanitized) content; empty when extraction produced nothing.
    pub content: String,
    /// Number of http(s) links found in the page's HTML.
    pub links_found: usize,
}
147
148impl serde::Serialize for CrawlResult {
149 fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
150 use serde::ser::SerializeMap;
151 let fetched_at = humantime::format_rfc3339_millis(self.fetched_at).to_string();
152 match &self.outcome {
153 Ok(page) => {
154 let mut map = serializer.serialize_map(None)?;
155 map.serialize_entry("type", "page")?;
156 map.serialize_entry("url", &self.url)?;
157 map.serialize_entry("depth", &self.depth)?;
158 map.serialize_entry("fetched_at", &fetched_at)?;
159 if let Some(t) = &page.title {
160 map.serialize_entry("title", t)?;
161 }
162 map.serialize_entry("content", &page.content)?;
163 map.serialize_entry("links_found", &page.links_found)?;
164 map.end()
165 }
166 Err(e) => {
167 let mut map = serializer.serialize_map(None)?;
168 map.serialize_entry("type", "error")?;
169 map.serialize_entry("url", &self.url)?;
170 map.serialize_entry("depth", &self.depth)?;
171 map.serialize_entry("fetched_at", &fetched_at)?;
172 map.serialize_entry("error", &e.to_string())?;
173 map.end()
174 }
175 }
176 }
177}
178
179impl CrawlResult {
180 fn from_internal(r: CrawlPageResult) -> Self {
181 let outcome = match r.status {
182 CrawlStatus::Ok => Ok(CrawlPage {
183 title: r.title,
184 content: r.content.unwrap_or_default(),
185 links_found: r.links_found,
186 }),
187 CrawlStatus::Error => Err(r
188 .error
189 .unwrap_or_else(|| crate::error::Error::engine("unknown crawl error", None))),
190 };
191 Self {
192 url: r.url,
193 depth: r.depth,
194 fetched_at: r.fetched_at,
195 outcome,
196 }
197 }
198}
199
200pub fn crawl_each_blocking<F>(opts: &CrawlOptions, on_page: F) -> crate::error::Result<()>
202where
203 F: FnMut(CrawlResult) + Send,
204{
205 crate::runtime::block_on(crawl_each(opts, on_page)).map_err(|e| crate::error::Error::engine(e, None))?
206}
207
208pub async fn crawl_each<F>(opts: &CrawlOptions, mut on_page: F) -> crate::error::Result<()>
210where
211 F: FnMut(CrawlResult) + Send,
212{
213 net::ensure_crypto_provider();
214 let plan = build_crawl_plan(opts)?;
215 let robots = spawn_blocking({
216 let seed = plan.seed.clone();
217 let user_agent = plan.user_agent.clone();
218 let timeout = Duration::from_secs(plan.timeout_secs);
219 move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
220 })
221 .await
222 .unwrap_or(RobotsPolicy::Unreachable);
223 run(plan, robots, &bridge::ServoFetcher, |r| {
224 on_page(CrawlResult::from_internal(r));
225 })
226 .await;
227 Ok(())
228}
229
230pub fn crawl_blocking(opts: &CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
232 let mut results = Vec::new();
233 crawl_each_blocking(opts, |r| results.push(r))?;
234 Ok(results)
235}
236
237pub async fn crawl(opts: &CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
239 let mut results = Vec::new();
240 crawl_each(opts, |r| results.push(r)).await?;
241 Ok(results)
242}
243
244fn build_crawl_plan(opts: &CrawlOptions) -> crate::error::Result<CrawlPlan> {
245 let seed = net::validate_url(&opts.url)?;
246 let include = if opts.include.is_empty() {
247 None
248 } else {
249 Some(crate::scope::build_globset(&opts.include)?)
250 };
251 let exclude = if opts.exclude.is_empty() {
252 None
253 } else {
254 Some(crate::scope::build_globset(&opts.exclude)?)
255 };
256 Ok(CrawlPlan {
257 seed,
258 limit: opts.limit,
259 max_depth: opts.max_depth,
260 timeout_secs: opts.timeout.as_secs().max(1),
261 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
262 include,
263 exclude,
264 selector: opts.selector.clone(),
265 json: opts.json,
266 user_agent: opts.user_agent.clone(),
267 concurrency: opts.concurrency,
268 delay: opts.delay,
269 })
270}
271
/// Validated, ready-to-run form of [`CrawlOptions`] consumed by [`run`].
pub(crate) struct CrawlPlan {
    /// Parsed and validated seed URL.
    pub seed: Url,
    /// Maximum number of pages to fetch and report.
    pub limit: usize,
    /// Links are only followed from pages whose depth is below this value.
    pub max_depth: usize,
    /// Per-fetch timeout in whole seconds.
    pub timeout_secs: u64,
    /// Settle time in milliseconds, forwarded to the page fetcher.
    pub settle_ms: u64,
    /// Compiled include globs, or `None` when no include patterns were given.
    pub include: Option<globset::GlobSet>,
    /// Compiled exclude globs, or `None` when no exclude patterns were given.
    pub exclude: Option<globset::GlobSet>,
    /// Optional selector forwarded to content extraction.
    pub selector: Option<String>,
    /// Extract JSON instead of text when `true`.
    pub json: bool,
    /// Optional User-Agent forwarded to the fetcher.
    pub user_agent: Option<String>,
    /// Maximum number of fetches in flight at once.
    pub concurrency: usize,
    /// Minimum spacing between fetch starts; `None` disables pacing.
    pub delay: Option<Duration>,
}
289
/// Internal per-page record produced by the crawl loop before conversion to [`CrawlResult`].
pub(crate) struct CrawlPageResult {
    /// URL that was fetched, as a string.
    pub url: String,
    /// Link depth of the page (the seed is depth 0).
    pub depth: usize,
    /// Whether the fetch succeeded or failed.
    pub status: CrawlStatus,
    /// Page title, when one was found (always `None` for errors).
    pub title: Option<String>,
    /// Extracted content, when extraction succeeded (always `None` for errors).
    pub content: Option<String>,
    /// The fetch error for failed pages; `None` for successful ones.
    pub error: Option<crate::error::Error>,
    /// Number of http(s) links found in the page (0 for errors).
    pub links_found: usize,
    /// Wall-clock time recorded once the fetch attempt finished.
    pub fetched_at: SystemTime,
}
301
/// Outcome tag for a [`CrawlPageResult`].
pub(crate) enum CrawlStatus {
    /// The page was fetched and processed.
    Ok,
    /// The fetch failed; the record's `error` field carries the cause.
    Error,
}
307
/// Crawl frontier: the FIFO of URLs still to fetch plus the de-duplication state.
struct Frontier {
    /// Pending `(url, depth)` pairs, consumed front-to-back.
    queue: VecDeque<(Url, usize)>,
    /// Normalized form of every URL ever enqueued (including the seed).
    visited: HashSet<String>,
    /// Hashes of page contents already reported, used to drop duplicate pages.
    content_hashes: HashSet<u64>,
}
313
314impl Frontier {
315 fn new(seed: &Url) -> Self {
316 Self {
317 queue: VecDeque::from([(seed.clone(), 0)]),
318 visited: HashSet::from([normalize_url(seed)]),
319 content_hashes: HashSet::new(),
320 }
321 }
322
323 fn try_enqueue(&mut self, url: Url, depth: usize) -> bool {
324 if self.visited.insert(normalize_url(&url)) {
325 self.queue.push_back((url, depth));
326 true
327 } else {
328 false
329 }
330 }
331
332 fn pop(&mut self) -> Option<(Url, usize)> {
333 self.queue.pop_front()
334 }
335
336 fn is_duplicate_content(&mut self, content: &str) -> bool {
337 let mut h = DefaultHasher::new();
338 content.hash(&mut h);
339 !self.content_hashes.insert(h.finish())
340 }
341
342 fn pending(&self) -> usize {
343 self.queue.len()
344 }
345}
346
347fn extract_links_from_html(html: &str, base: &Url) -> Vec<Url> {
348 dom_query::Document::from(html)
349 .select("a[href]")
350 .iter()
351 .filter_map(|el| {
352 let href = el.attr("href")?;
353 let href = href.trim();
354 if href.is_empty() {
355 return None;
356 }
357 let resolved = base.join(href).ok()?;
358 matches!(resolved.scheme(), "http" | "https").then_some(resolved)
359 })
360 .collect()
361}
362
363pub(crate) async fn run(
364 opts: CrawlPlan,
365 robots: RobotsPolicy,
366 fetcher: &(impl PageFetcher + Clone),
367 mut on_page: impl FnMut(CrawlPageResult),
368) {
369 let mut frontier = Frontier::new(&opts.seed);
370 let mut completed: usize = 0;
371 let mut in_flight: JoinSet<FetchOutcome> = JoinSet::new();
372
373 let mut ticker = opts.delay.map(|period| {
375 let mut t = interval(period);
376 t.set_missed_tick_behavior(MissedTickBehavior::Delay);
377 t
378 });
379
380 let concurrency = opts.concurrency.max(1);
381
382 loop {
383 while in_flight.len() < concurrency && completed + in_flight.len() < opts.limit {
384 let Some((url, depth)) = frontier.pop() else {
385 break;
386 };
387 if let Some(t) = ticker.as_mut() {
388 t.tick().await;
389 }
390 spawn_fetch(&mut in_flight, fetcher, &opts, url, depth);
391 }
392
393 let outcome = match in_flight.join_next().await {
394 None => break,
395 Some(Ok(o)) => o,
396 Some(Err(e)) if e.is_panic() => {
397 tracing::error!(err = %e, "crawl fetch task panicked");
398 continue;
399 }
400 Some(Err(e)) => {
401 tracing::warn!(err = %e, "crawl fetch task cancelled");
402 continue;
403 }
404 };
405
406 let FetchOutcome {
407 url,
408 depth,
409 result,
410 fetched_at,
411 } = outcome;
412 let page = match result {
413 Ok(p) => p,
414 Err(err) => {
415 on_page(error_result(&url, depth, err, fetched_at));
416 completed += 1;
417 continue;
418 }
419 };
420
421 let budget_used = completed + in_flight.len() + 1;
422 let mut ctx = CrawlContext {
423 frontier: &mut frontier,
424 robots: &robots,
425 opts: &opts,
426 };
427 if let Some(r) = process_ok_fetch(&mut ctx, &url, depth, &page, budget_used, fetched_at) {
428 on_page(r);
429 completed += 1;
430 }
431 }
432}
433
434fn spawn_fetch(
435 in_flight: &mut JoinSet<FetchOutcome>,
436 fetcher: &(impl PageFetcher + Clone),
437 opts: &CrawlPlan,
438 url: Url,
439 depth: usize,
440) {
441 let url_str = url.to_string();
442 let timeout = opts.timeout_secs;
443 let settle = opts.settle_ms;
444 let user_agent = opts.user_agent.clone();
445 let f = fetcher.clone();
446 in_flight.spawn_blocking(move || {
447 let result = f
448 .fetch_page(bridge::FetchOptions {
449 url: &url_str,
450 timeout_secs: timeout,
451 settle_ms: settle,
452 mode: bridge::FetchMode::Content { include_a11y: false },
453 user_agent: user_agent.as_deref(),
454 })
455 .map_err(|e| crate::error::Error::engine(e, Some(url_str.clone())));
456 FetchOutcome {
457 url,
458 depth,
459 result,
460 fetched_at: SystemTime::now(),
461 }
462 });
463}
464
/// Borrowed crawl state handed to [`process_ok_fetch`] for a single page.
struct CrawlContext<'a> {
    /// Frontier that receives newly discovered links and tracks seen content.
    frontier: &'a mut Frontier,
    /// Robots policy consulted before a link is enqueued.
    robots: &'a RobotsPolicy,
    /// The plan being executed (limits, scope filters, extraction settings).
    opts: &'a CrawlPlan,
}
471
472fn process_ok_fetch(
474 ctx: &mut CrawlContext<'_>,
475 url: &Url,
476 depth: usize,
477 page: &bridge::ServoPage,
478 budget_used: usize,
479 fetched_at: SystemTime,
480) -> Option<CrawlPageResult> {
481 let html = if page.html.len() > MAX_HTML_BYTES {
482 &page.html[..crate::sanitize::floor_char_boundary(&page.html, MAX_HTML_BYTES)]
483 } else {
484 &page.html
485 };
486
487 let input = crate::extract::ExtractInput::new(html, url.as_str())
488 .with_layout_json(page.layout_json.as_deref())
489 .with_inner_text(page.inner_text.as_deref())
490 .with_selector(ctx.opts.selector.as_deref());
491
492 let content = if ctx.opts.json {
493 crate::extract::extract_json(&input).ok()
494 } else {
495 crate::extract::extract_text(&input).ok()
496 };
497
498 if content.as_ref().is_some_and(|c| ctx.frontier.is_duplicate_content(c)) {
499 return None;
500 }
501
502 let links = extract_links_from_html(html, url);
503 let links_found = links.len();
504
505 if depth < ctx.opts.max_depth {
506 for link in &links {
507 if budget_used + ctx.frontier.pending() >= ctx.opts.limit {
508 break;
509 }
510 if !is_same_site(&ctx.opts.seed, link)
511 || net::validate_url_with_policy(link.as_str(), bridge::engine_policy()).is_err()
512 || !ctx.robots.is_allowed(link)
513 || !matches_scope(link, ctx.opts.include.as_ref(), ctx.opts.exclude.as_ref())
514 {
515 continue;
516 }
517 ctx.frontier.try_enqueue(link.clone(), depth + 1);
518 }
519 }
520
521 let title = {
522 let doc = dom_query::Document::from(html);
523 let t = doc.select("title").text().to_string();
524 (!t.is_empty()).then_some(t)
525 };
526
527 Some(CrawlPageResult {
528 url: url.to_string(),
529 depth,
530 status: CrawlStatus::Ok,
531 title,
532 content: content.map(|c| crate::sanitize::sanitize(&c).into_owned()),
533 error: None,
534 links_found,
535 fetched_at,
536 })
537}
538
/// What a fetch task hands back to the crawl loop.
struct FetchOutcome {
    /// URL that was fetched.
    url: Url,
    /// Link depth of the URL.
    depth: usize,
    /// The fetched page, or the error that prevented the fetch.
    result: Result<bridge::ServoPage, crate::error::Error>,
    /// Wall-clock time recorded right after the fetch returned.
    fetched_at: SystemTime,
}
546
547fn error_result(url: &Url, depth: usize, error: crate::error::Error, fetched_at: SystemTime) -> CrawlPageResult {
548 CrawlPageResult {
549 url: url.to_string(),
550 depth,
551 status: CrawlStatus::Error,
552 title: None,
553 content: None,
554 error: Some(error),
555 links_found: 0,
556 fetched_at,
557 }
558}
559
// Unit tests for the crawl options builder, the frontier, link extraction and the crawl
// loop. Crawl-loop tests run `run` against an in-memory `MockFetcher` instead of a real
// page fetcher.
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;

    use super::*;

    // `CrawlOptions::new` applies the documented defaults.
    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
        assert_eq!(opts.concurrency, 1);
        assert_eq!(opts.delay, Some(Duration::from_millis(500)));
    }

    // Chained setters each overwrite their own field.
    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"])
            .concurrency(4)
            .delay(None);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
        assert_eq!(opts.concurrency, 4);
        assert_eq!(opts.delay, None);
    }

    // A concurrency of zero is raised to one.
    #[test]
    fn crawl_options_concurrency_clamps_below_one() {
        let opts = CrawlOptions::new("https://example.com").concurrency(0);
        assert_eq!(opts.concurrency, 1);
    }

    // A custom delay is stored as given.
    #[test]
    fn crawl_options_delay_custom_value() {
        let opts = CrawlOptions::new("https://example.com").delay(Some(Duration::from_secs(2)));
        assert_eq!(opts.delay, Some(Duration::from_secs(2)));
    }

    // CR/LF in a user agent must not survive into the stored value.
    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler /2.0"));
    }

    // In-memory fetcher: maps URL strings to the HTML served for them.
    #[derive(Clone)]
    struct MockFetcher(Arc<HashMap<String, String>>);

    impl MockFetcher {
        // Builds a fetcher from `(url, html)` pairs.
        fn new(pages: &[(&str, &str)]) -> Self {
            Self(Arc::new(
                pages.iter().map(|(u, h)| (u.to_string(), h.to_string())).collect(),
            ))
        }
    }

    impl PageFetcher for MockFetcher {
        // Serves the stored HTML for a known URL; unknown URLs fail with "not found".
        fn fetch_page(&self, opts: bridge::FetchOptions<'_>) -> anyhow::Result<bridge::ServoPage> {
            self.0
                .get(opts.url)
                .map(|html| bridge::ServoPage {
                    html: html.clone(),
                    ..Default::default()
                })
                .ok_or_else(|| anyhow::anyhow!("not found: {}", opts.url))
        }
    }

    // HTML page titled "Test" whose body is one anchor per entry in `links`.
    fn page(links: &[&str]) -> String {
        use std::fmt::Write as _;
        let mut anchors = String::new();
        for l in links {
            write!(anchors, r#"<a href="{l}">link</a>"#).unwrap();
        }
        format!("<html><head><title>Test</title></head><body>{anchors}</body></html>")
    }

    // Link-free page whose title and body depend on `tag`, so that pages built from
    // different tags differ in content.
    fn distinct_page(tag: &str) -> String {
        format!("<html><head><title>{tag}</title></head><body>page {tag}</body></html>")
    }

    // Test harness: crawls `pages` (the first entry is the seed) with a default plan
    // (limit 50, depth 3, concurrency 1, no delay) after `configure` has adjusted it,
    // then hands the collected results to `assert`.
    async fn check(
        pages: &[(&str, &str)],
        configure: impl FnOnce(&mut CrawlPlan),
        assert: impl FnOnce(&[CrawlPageResult]),
    ) {
        let fetcher = MockFetcher::new(pages);
        let seed = pages[0].0;
        let mut opts = CrawlPlan {
            seed: Url::parse(seed).unwrap(),
            limit: 50,
            max_depth: 3,
            timeout_secs: 30,
            settle_ms: 0,
            include: None,
            exclude: None,
            selector: None,
            json: false,
            user_agent: None,
            concurrency: 1,
            delay: None,
        };
        configure(&mut opts);
        let mut results = Vec::new();
        run(opts, RobotsPolicy::Unavailable, &fetcher, |r| results.push(r)).await;
        assert(&results);
    }

    // A seed with no links yields exactly one result.
    #[tokio::test]
    async fn crawl_single_page() {
        check(
            &[("https://example.com/", &page(&[]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 1);
                assert_eq!(r[0].url, "https://example.com/");
            },
        )
        .await;
    }

    // Links on the seed are followed.
    #[tokio::test]
    async fn crawl_follows_links() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                (
                    "https://example.com/a",
                    "<html><head><title>A</title></head><body>page a</body></html>",
                ),
                (
                    "https://example.com/b",
                    "<html><head><title>B</title></head><body>page b</body></html>",
                ),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    // With max_depth = 1 only the seed and its direct links are crawled.
    #[tokio::test]
    async fn crawl_respects_depth_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a"])),
                ("https://example.com/a", &page(&["/b"])),
                ("https://example.com/b", &page(&["/c"])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.max_depth = 1,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // The page limit caps the number of results.
    #[tokio::test]
    async fn crawl_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c"])),
                ("https://example.com/a", &page(&[])),
                ("https://example.com/b", &page(&[])),
                ("https://example.com/c", &page(&[])),
            ],
            |o| o.limit = 2,
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // Links to another site are not followed.
    #[tokio::test]
    async fn crawl_skips_cross_site_links() {
        check(
            &[
                ("https://example.com/", &page(&["https://other.com/x"])),
                ("https://other.com/x", &page(&[])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 1),
        )
        .await;
    }

    // Repeated links and links back to the seed are fetched only once.
    #[tokio::test]
    async fn crawl_deduplicates_urls() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/a", "/a"])),
                ("https://example.com/a", &page(&["/"])),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // A link the fetcher cannot serve is reported as an error result.
    #[tokio::test]
    async fn crawl_handles_fetch_errors() {
        check(
            &[("https://example.com/", &page(&["/missing"]))],
            |_| {},
            |r| {
                assert_eq!(r.len(), 2);
                assert!(matches!(r[1].status, CrawlStatus::Error));
                assert!(r[1].error.is_some());
            },
        )
        .await;
    }

    // Only links matching the include globs are crawled.
    #[tokio::test]
    async fn crawl_applies_include_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/docs/a", "/blog/b"])),
                ("https://example.com/docs/a", &page(&[])),
                ("https://example.com/blog/b", &page(&[])),
            ],
            |o| o.include = Some(crate::scope::build_globset(&["/docs/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(r.iter().any(|p| p.url == "https://example.com/docs/a"));
                assert!(!r.iter().any(|p| p.url == "https://example.com/blog/b"));
            },
        )
        .await;
    }

    // Links matching the exclude globs are skipped.
    #[tokio::test]
    async fn crawl_applies_exclude_glob() {
        check(
            &[
                ("https://example.com/", &page(&["/public", "/secret/data"])),
                ("https://example.com/public", &page(&[])),
                ("https://example.com/secret/data", &page(&[])),
            ],
            |o| o.exclude = Some(crate::scope::build_globset(&["/secret/**".into()]).unwrap()),
            |r| {
                assert_eq!(r.len(), 2);
                assert!(!r.iter().any(|p| p.url == "https://example.com/secret/data"));
            },
        )
        .await;
    }

    // Two URLs serving identical content produce a single result between them.
    #[tokio::test]
    async fn crawl_deduplicates_content() {
        let same = "<html><head><title>Same</title></head><body>identical</body></html>";
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", same),
                ("https://example.com/b", same),
            ],
            |_| {},
            |r| assert_eq!(r.len(), 2),
        )
        .await;
    }

    // With concurrency 4 every page is still visited (order is not asserted).
    #[tokio::test]
    async fn crawl_concurrency_visits_all_pages() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| o.concurrency = 4,
            |r| {
                assert_eq!(r.len(), 5);
                let urls: HashSet<&str> = r.iter().map(|p| p.url.as_str()).collect();
                for u in [
                    "https://example.com/",
                    "https://example.com/a",
                    "https://example.com/b",
                    "https://example.com/c",
                    "https://example.com/d",
                ] {
                    assert!(urls.contains(u), "missing {u}");
                }
            },
        )
        .await;
    }

    // The page limit also holds when several fetches run at once.
    #[tokio::test]
    async fn crawl_concurrency_respects_limit() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b", "/c", "/d"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
                ("https://example.com/c", &distinct_page("c")),
                ("https://example.com/d", &distinct_page("d")),
            ],
            |o| {
                o.concurrency = 4;
                o.limit = 3;
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
    }

    // With a single worker, results arrive in breadth-first discovery order.
    #[tokio::test]
    async fn crawl_concurrency_one_preserves_bfs_order() {
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| o.concurrency = 1,
            |r| {
                assert_eq!(r.len(), 3);
                assert_eq!(r[0].url, "https://example.com/");
                assert_eq!(r[1].url, "https://example.com/a");
                assert_eq!(r[2].url, "https://example.com/b");
            },
        )
        .await;
    }

    // Runs on tokio's paused clock: three paced fetches with a 500 ms delay must take
    // at least one second of (virtual) time.
    #[tokio::test(start_paused = true)]
    async fn crawl_delay_enforces_minimum_interval() {
        let start = tokio::time::Instant::now();
        check(
            &[
                ("https://example.com/", &page(&["/a", "/b"])),
                ("https://example.com/a", &distinct_page("a")),
                ("https://example.com/b", &distinct_page("b")),
            ],
            |o| {
                o.concurrency = 1;
                o.delay = Some(Duration::from_millis(500));
            },
            |r| assert_eq!(r.len(), 3),
        )
        .await;
        let elapsed = start.elapsed();
        assert!(
            elapsed >= Duration::from_secs(1),
            "expected >= 1s for 3 pages with 500ms delay, got {elapsed:?}"
        );
    }

    // The seed counts as visited; other URLs can be enqueued exactly once.
    #[test]
    fn frontier_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.try_enqueue(seed, 0));
        let other = Url::parse("https://example.com/page").unwrap();
        assert!(f.try_enqueue(other.clone(), 1));
        assert!(!f.try_enqueue(other, 1));
    }

    // A new frontier holds just the seed at depth 0 and is empty after popping it.
    #[test]
    fn frontier_pop_and_pending() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert_eq!(f.pending(), 1);
        let (url, depth) = f.pop().unwrap();
        assert_eq!(url.as_str(), "https://example.com/");
        assert_eq!(depth, 0);
        assert_eq!(f.pending(), 0);
        assert!(f.pop().is_none());
    }

    // Only http(s) targets survive; javascript:, data: and mailto: links are dropped.
    #[test]
    fn extract_links_filters_dangerous_schemes() {
        let html = r#"<a href="https://example.com/a">A</a>
            <a href="javascript:void(0)">JS</a>
            <a href="JAVASCRIPT:alert(1)">JS upper</a>
            <a href="data:text/html,<h1>hi</h1>">Data</a>
            <a href="mailto:x@y.com">Mail</a>
            <a href="/relative">Rel</a>"#;
        let base = Url::parse("https://example.com/").unwrap();
        let links = extract_links_from_html(html, &base);
        assert_eq!(links.len(), 2);
        assert_eq!(links[0].as_str(), "https://example.com/a");
        assert_eq!(links[1].as_str(), "https://example.com/relative");
    }

    // An error record carries the error and no content.
    #[test]
    fn error_result_fields() {
        let url = Url::parse("https://example.com/fail").unwrap();
        let r = error_result(&url, 2, crate::error::Error::engine("timeout", None), SystemTime::now());
        assert!(matches!(r.status, CrawlStatus::Error));
        assert!(r.error.as_ref().is_some_and(|e| e.to_string().contains("timeout")));
        assert!(r.content.is_none());
    }

    // Content is a duplicate only on its second and later sightings.
    #[test]
    fn content_hash_dedup() {
        let seed = Url::parse("https://example.com/").unwrap();
        let mut f = Frontier::new(&seed);
        assert!(!f.is_duplicate_content("unique content"));
        assert!(f.is_duplicate_content("unique content"));
        assert!(!f.is_duplicate_content("different content"));
    }
}