1use std::time::Duration;
4
5use crate::error::Error;
6
/// A fetched page together with the artifacts captured alongside it.
///
/// Instances are built by `Page::from_servo` (engine fetches) or directly by
/// `fetch` for the PDF path, which fills only `inner_text`.
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// HTML of the page. `fetch` leaves this empty when the URL was handled as a PDF.
    pub html: String,
    /// Text of the page as reported by the engine (empty when the engine
    /// reported none), or the extracted PDF text on the PDF path.
    pub inner_text: String,
    /// Title text taken from the HTML's `<title>`; `None` when it is empty.
    pub title: Option<String>,
    /// Layout data as a JSON string, forwarded to the extractor when present.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// JavaScript result reported by the engine bridge, if any — presumably set
    /// for `FetchOptions::javascript` fetches (TODO confirm).
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Console messages reported by the engine bridge for this page.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree as reported by the engine bridge, if any.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    /// PNG-encoded screenshot bytes; never serialized. Read via `Page::screenshot_png`.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}
31
32impl Page {
33 pub fn markdown(&self) -> crate::error::Result<String> {
35 self.markdown_with_url("")
36 }
37
38 pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
40 let input = crate::extract::ExtractInput::new(&self.html, url)
41 .with_layout_json(self.layout_json.as_deref())
42 .with_inner_text(Some(&self.inner_text));
43 Ok(crate::extract::extract_text(&input)?)
44 }
45
46 pub fn extract_json(&self) -> crate::error::Result<String> {
48 self.extract_json_with_url("")
49 }
50
51 pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
53 let input = crate::extract::ExtractInput::new(&self.html, url)
54 .with_layout_json(self.layout_json.as_deref())
55 .with_inner_text(Some(&self.inner_text));
56 Ok(crate::extract::extract_json(&input)?)
57 }
58
59 pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
61 let input = crate::extract::ExtractInput::new(&self.html, url)
62 .with_layout_json(self.layout_json.as_deref())
63 .with_inner_text(Some(&self.inner_text))
64 .with_selector(Some(selector));
65 Ok(crate::extract::extract_text(&input)?)
66 }
67
68 pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
70 let input = crate::extract::ExtractInput::new(&self.html, url)
71 .with_layout_json(self.layout_json.as_deref())
72 .with_inner_text(Some(&self.inner_text))
73 .with_selector(Some(selector));
74 Ok(crate::extract::extract_json(&input)?)
75 }
76
77 #[must_use]
79 pub fn screenshot_png(&self) -> Option<&[u8]> {
80 self.screenshot_png.as_deref()
81 }
82
83 pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
84 let title = {
85 let doc = dom_query::Document::from(page.html.as_str());
86 let t = doc.select("title").text().to_string();
87 if t.is_empty() { None } else { Some(t) }
88 };
89 let screenshot_png = page.screenshot.and_then(|img| {
90 let mut buf = std::io::Cursor::new(Vec::new());
91 img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
92 Some(buf.into_inner())
93 });
94 Self {
95 html: page.html,
96 inner_text: page.inner_text.unwrap_or_default(),
97 title,
98 layout_json: page.layout_json,
99 js_result: page.js_result,
100 console_messages: page
101 .console_messages
102 .into_iter()
103 .map(|m| ConsoleMessage {
104 level: match m.level {
105 crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
106 crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
107 crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
108 crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
109 crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
110 crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
111 },
112 message: m.message,
113 })
114 .collect(),
115 screenshot_png,
116 accessibility_tree: page.accessibility_tree,
117 }
118 }
119}
120
/// A single console message captured for a fetched page.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Level the message was reported at.
    pub level: ConsoleLevel,
    /// Message text as reported by the engine bridge.
    pub message: String,
}
130
/// Level of a [`ConsoleMessage`].
///
/// Serialized as the lowercase variant name (`"log"`, `"debug"`, ...), which
/// matches what `ConsoleLevel::as_str` returns.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    /// Rendered as `"log"`.
    Log,
    /// Rendered as `"debug"`.
    Debug,
    /// Rendered as `"info"`.
    Info,
    /// Rendered as `"warn"`.
    Warn,
    /// Rendered as `"error"`.
    Error,
    /// Rendered as `"trace"`.
    Trace,
}
149
150impl ConsoleLevel {
151 #[must_use]
153 pub fn as_str(&self) -> &'static str {
154 match self {
155 Self::Log => "log",
156 Self::Debug => "debug",
157 Self::Info => "info",
158 Self::Warn => "warn",
159 Self::Error => "error",
160 Self::Trace => "trace",
161 }
162 }
163}
164
165impl std::fmt::Display for ConsoleLevel {
166 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
167 f.pad(self.as_str())
168 }
169}
170
/// What a fetch should produce; selected through the `FetchOptions` constructors.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Plain content fetch (the default). Only this mode goes through the PDF
    /// probe in `fetch`.
    #[default]
    Content,
    /// Capture a screenshot.
    Screenshot {
        /// Forwarded unchanged to the engine bridge's screenshot mode —
        /// presumably "capture the whole page, not just the viewport" (TODO confirm).
        full_page: bool,
    },
    /// Evaluate the contained JavaScript expression; `fetch` forwards it to the
    /// bridge's `ExecuteJs` mode.
    JavaScript(String),
}
180
/// Builder-style options for a single-page [`fetch`].
///
/// Create with `FetchOptions::new`, `FetchOptions::screenshot` or
/// `FetchOptions::javascript`, then chain the setter methods.
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    /// URL to fetch; validated by `fetch` before any network activity.
    pub(crate) url: String,
    /// Time budget. `fetch` passes it on in whole seconds, at least 1.
    pub(crate) timeout: Duration,
    /// Settle delay; `fetch` passes it on in milliseconds.
    pub(crate) settle: Duration,
    /// What the fetch should produce.
    pub(crate) mode: FetchMode,
    /// User-agent override (already sanitized); `None` leaves the engine default.
    pub(crate) user_agent: Option<String>,
}
196
197impl FetchOptions {
198 pub fn new(url: &str) -> Self {
200 Self {
201 url: url.into(),
202 timeout: Duration::from_secs(30),
203 settle: Duration::ZERO,
204 mode: FetchMode::Content,
205 user_agent: None,
206 }
207 }
208
209 pub fn screenshot(url: &str, full_page: bool) -> Self {
211 Self {
212 mode: FetchMode::Screenshot { full_page },
213 ..Self::new(url)
214 }
215 }
216
217 pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
219 Self {
220 mode: FetchMode::JavaScript(expression.into()),
221 ..Self::new(url)
222 }
223 }
224
225 pub fn timeout(mut self, timeout: Duration) -> Self {
227 self.timeout = timeout;
228 self
229 }
230
231 pub fn settle(mut self, settle: Duration) -> Self {
233 self.settle = settle;
234 self
235 }
236
237 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
239 self.user_agent = Some(sanitize_user_agent(ua.into()));
240 self
241 }
242}
243
244#[allow(clippy::needless_pass_by_value)]
249pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
250 ensure_crypto_provider();
251
252 crate::net::validate_url(&opts.url).map_err(|e| map_url_error(&opts.url, e))?;
253
254 if matches!(opts.mode, FetchMode::Content)
255 && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
256 {
257 let text = crate::extract::extract_pdf(&bytes);
258 return Ok(Page {
259 html: String::new(),
260 inner_text: text,
261 ..Page::default()
262 });
263 }
264
265 let bridge_opts = crate::bridge::FetchOptions {
266 url: &opts.url,
267 timeout_secs: opts.timeout.as_secs().max(1),
268 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
269 user_agent: opts.user_agent.as_deref(),
270 mode: match opts.mode {
271 FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
272 FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
273 FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
274 expression: expr.clone(),
275 },
276 },
277 };
278
279 let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
280 let msg = format!("{e:#}");
281 if msg.contains("timed out") {
282 Error::Timeout {
283 url: opts.url.clone(),
284 timeout: opts.timeout,
285 }
286 } else {
287 Error::Engine(msg)
288 }
289 })?;
290
291 Ok(Page::from_servo(servo_page))
292}
293
/// Builder-style options for [`crawl`] and [`crawl_each`].
///
/// Create with `CrawlOptions::new`, then chain the setter methods.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    /// Seed URL; validated when the crawl starts.
    pub(crate) url: String,
    /// Crawl limit, forwarded to the crawler (default 50) — presumably the
    /// maximum number of pages (TODO confirm).
    pub(crate) limit: usize,
    /// Maximum depth, forwarded to the crawler (default 3).
    pub(crate) max_depth: usize,
    /// Time budget, forwarded in whole seconds (at least 1).
    pub(crate) timeout: Duration,
    /// Settle delay, forwarded in milliseconds.
    pub(crate) settle: Duration,
    /// Include patterns, compiled into a glob set; empty means no include filter.
    pub(crate) include: Vec<String>,
    /// Exclude patterns, compiled into a glob set; empty means no exclude filter.
    pub(crate) exclude: Vec<String>,
    /// Optional selector forwarded to the crawler.
    pub(crate) selector: Option<String>,
    /// Forwarded to the crawler as its `json` flag — presumably selects JSON
    /// output for page content (TODO confirm).
    pub(crate) json: bool,
    /// User-agent override (already sanitized), if any.
    pub(crate) user_agent: Option<String>,
}
309
310impl CrawlOptions {
311 pub fn new(url: &str) -> Self {
313 Self {
314 url: url.into(),
315 limit: 50,
316 max_depth: 3,
317 timeout: Duration::from_secs(30),
318 settle: Duration::ZERO,
319 include: Vec::new(),
320 exclude: Vec::new(),
321 selector: None,
322 json: false,
323 user_agent: None,
324 }
325 }
326
327 pub fn limit(mut self, n: usize) -> Self {
329 self.limit = n;
330 self
331 }
332
333 pub fn max_depth(mut self, n: usize) -> Self {
335 self.max_depth = n;
336 self
337 }
338
339 pub fn timeout(mut self, timeout: Duration) -> Self {
341 self.timeout = timeout;
342 self
343 }
344
345 pub fn settle(mut self, settle: Duration) -> Self {
347 self.settle = settle;
348 self
349 }
350
351 pub fn include(mut self, patterns: &[&str]) -> Self {
353 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
354 self
355 }
356
357 pub fn exclude(mut self, patterns: &[&str]) -> Self {
359 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
360 self
361 }
362
363 pub fn json(mut self, json: bool) -> Self {
365 self.json = json;
366 self
367 }
368
369 pub fn selector(mut self, selector: impl Into<String>) -> Self {
371 self.selector = Some(selector.into());
372 self
373 }
374
375 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
377 self.user_agent = Some(sanitize_user_agent(ua.into()));
378 self
379 }
380}
381
/// Outcome of crawling one URL.
///
/// Serialized through a hand-written `Serialize` impl as a flat map with a
/// `"status"` entry of `"ok"` or `"error"`.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL this result belongs to.
    pub url: String,
    /// Depth reported by the crawler for this URL.
    pub depth: usize,
    /// The page on success, or the error reported for this URL.
    pub outcome: Result<CrawlPage, CrawlError>,
}
393
/// Successful payload of a [`CrawlResult`].
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Page title, if the crawler reported one.
    pub title: Option<String>,
    /// Page content; empty when the crawler reported none.
    pub content: String,
    /// Link count reported by the crawler for this page.
    pub links_found: usize,
}
404
/// Failure payload of a `CrawlResult`: the error text reported for one URL.
#[derive(Debug, Clone)]
pub struct CrawlError {
    /// Human-readable error message; empty when none was reported.
    pub message: String,
}

impl std::fmt::Display for CrawlError {
    /// Writes the message, honoring width, alignment and precision flags
    /// (for example `{:>20}`) the same way formatting a `str` does.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `pad` rather than `write_str`, so formatter flags are not silently ignored.
        f.pad(&self.message)
    }
}

impl std::error::Error for CrawlError {}
419
420impl serde::Serialize for CrawlResult {
421 fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
422 use serde::ser::SerializeMap;
423 match &self.outcome {
424 Ok(page) => {
425 let mut map = serializer.serialize_map(None)?;
426 map.serialize_entry("url", &self.url)?;
427 map.serialize_entry("depth", &self.depth)?;
428 map.serialize_entry("status", "ok")?;
429 if let Some(t) = &page.title {
430 map.serialize_entry("title", t)?;
431 }
432 map.serialize_entry("content", &page.content)?;
433 map.serialize_entry("links_found", &page.links_found)?;
434 map.end()
435 }
436 Err(e) => {
437 let mut map = serializer.serialize_map(None)?;
438 map.serialize_entry("url", &self.url)?;
439 map.serialize_entry("depth", &self.depth)?;
440 map.serialize_entry("status", "error")?;
441 map.serialize_entry("error", &e.message)?;
442 map.end()
443 }
444 }
445 }
446}
447
448impl CrawlResult {
449 fn from_internal(r: &crate::crawl::CrawlPageResult) -> Self {
450 let outcome = match r.status {
451 crate::crawl::CrawlStatus::Ok => Ok(CrawlPage {
452 title: r.title.clone(),
453 content: r.content.clone().unwrap_or_default(),
454 links_found: r.links_found,
455 }),
456 crate::crawl::CrawlStatus::Error => Err(CrawlError {
457 message: r.error.clone().unwrap_or_default(),
458 }),
459 };
460 Self {
461 url: r.url.clone(),
462 depth: r.depth,
463 outcome,
464 }
465 }
466}
467
468#[allow(clippy::needless_pass_by_value)]
470pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
471 ensure_crypto_provider();
472 let internal_opts = build_crawl_options(&opts)?;
473 crate::runtime::block_on(crate::crawl::run(internal_opts, |r| {
474 on_page(&CrawlResult::from_internal(r));
475 }))
476 .map_err(|e| Error::Engine(e.to_string()))?;
477 Ok(())
478}
479
480#[allow(clippy::needless_pass_by_value)]
482pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
483 let mut results = Vec::new();
484 crawl_each(opts, |r| results.push(r.clone()))?;
485 Ok(results)
486}
487
488pub fn markdown(url: &str) -> crate::error::Result<String> {
490 fetch(FetchOptions::new(url))?.markdown_with_url(url)
491}
492
493pub fn extract_json(url: &str) -> crate::error::Result<String> {
495 fetch(FetchOptions::new(url))?.extract_json_with_url(url)
496}
497
498pub fn text(url: &str) -> crate::error::Result<String> {
500 Ok(fetch(FetchOptions::new(url))?.inner_text)
501}
502
503pub fn validate_url(url: &str) -> crate::error::Result<url::Url> {
505 crate::net::validate_url(url).map_err(|e| map_url_error(url, e))
506}
507
508fn ensure_crypto_provider() {
509 let _ = rustls::crypto::aws_lc_rs::default_provider().install_default();
510}
511
/// Replaces every CR, LF and NUL character in `ua` with a single space.
///
/// Each offending character maps to its own space, so `"\r\n"` becomes two
/// spaces. A string without any of them is returned as-is, without
/// reallocating.
pub(crate) fn sanitize_user_agent(ua: String) -> String {
    const UNSAFE_CHARS: [char; 3] = ['\r', '\n', '\0'];

    if !ua.contains(UNSAFE_CHARS) {
        return ua;
    }
    ua.replace(UNSAFE_CHARS, " ")
}
520
521fn map_url_error(url: &str, e: crate::net::UrlError) -> Error {
522 match e {
523 crate::net::UrlError::PrivateAddress(host) => Error::AddressNotAllowed(host),
524 crate::net::UrlError::Invalid(reason) => Error::InvalidUrl {
525 url: url.into(),
526 reason,
527 },
528 }
529}
530
531fn build_crawl_options(opts: &CrawlOptions) -> crate::error::Result<crate::crawl::CrawlOptions> {
532 let seed = crate::net::validate_url(&opts.url).map_err(|e| map_url_error(&opts.url, e))?;
533 let include = if opts.include.is_empty() {
534 None
535 } else {
536 Some(crate::crawl::build_globset(&opts.include).map_err(|e| Error::Engine(e.to_string()))?)
537 };
538 let exclude = if opts.exclude.is_empty() {
539 None
540 } else {
541 Some(crate::crawl::build_globset(&opts.exclude).map_err(|e| Error::Engine(e.to_string()))?)
542 };
543 Ok(crate::crawl::CrawlOptions {
544 seed,
545 limit: opts.limit,
546 max_depth: opts.max_depth,
547 timeout_secs: opts.timeout.as_secs().max(1),
548 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
549 include,
550 exclude,
551 selector: opts.selector.clone(),
552 json: opts.json,
553 user_agent: opts.user_agent.clone(),
554 })
555}
556
// Unit tests for the option builders, `Page` extraction helpers, and the URL
// checks that `fetch` performs before touching the network.
#[cfg(test)]
mod tests {
    use super::*;

    // `FetchOptions::new` yields content mode, a 30 s timeout and no settle delay.
    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    // The screenshot constructor records the `full_page` flag in the mode.
    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    // The javascript constructor stores the expression verbatim.
    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    // Setters can be chained and each overrides its own field.
    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    // `CrawlOptions::new` defaults: limit 50, depth 3, 30 s timeout, no patterns.
    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
    }

    // Crawl setters can be chained; patterns are stored as owned strings.
    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"]);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
    }

    // A clean user agent is stored unchanged.
    #[test]
    fn fetch_user_agent_set() {
        let opts = FetchOptions::new("https://example.com").user_agent("MyBot/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    // No user agent is set unless the caller asks for one.
    #[test]
    fn fetch_user_agent_default_is_none() {
        let opts = FetchOptions::new("https://example.com");
        assert!(opts.user_agent.is_none());
    }

    // CR/LF in a user agent must not survive (header-injection guard).
    // NOTE(review): `sanitize_user_agent` maps `\r` and `\n` to one space each,
    // so a CRLF pair should yield two spaces — confirm the spacing in the
    // expected literal below.
    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\r\nX-Evil: yes");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot X-Evil: yes"));
    }

    // NUL bytes are replaced with a space as well.
    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\0/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot /1.0"));
    }

    // An empty user agent is kept as `Some("")`, not turned into `None`.
    #[test]
    fn fetch_user_agent_empty_string() {
        let opts = FetchOptions::new("https://example.com").user_agent("");
        assert_eq!(opts.user_agent.as_deref(), Some(""));
    }

    // The crawl builder applies the same sanitization as the fetch builder.
    // NOTE(review): same CRLF spacing question as in
    // `fetch_user_agent_sanitizes_crlf` — confirm the expected literal.
    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler /2.0"));
    }

    // Extraction works on a hand-built page and keeps the body text.
    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    // `extract_json` output must parse as JSON.
    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    // A default page carries no screenshot.
    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    // A selector limits extraction to the matching subtree.
    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = Page {
            html: "<html><body><article>keep</article><aside>drop</aside></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(md.contains("keep"));
        assert!(!md.contains("drop"));
    }

    // The JSON output echoes the supplied URL and contains the scoped text.
    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = Page {
            html: "<html><body><article>scoped</article></body></html>".into(),
            ..Page::default()
        };
        let json = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
        assert_eq!(parsed["url"].as_str(), Some("https://example.com/page"));
        assert!(parsed["text_content"].as_str().unwrap().contains("scoped"));
    }

    // A selector that matches nothing yields empty output rather than an error.
    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = Page {
            html: "<html><body><article>x</article></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(md.is_empty());
    }

    // Unparseable input is rejected as `Error::InvalidUrl` during validation.
    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    // Loopback addresses are refused.
    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    // `file://` URLs are refused.
    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
}