1use std::time::Duration;
4
5use crate::error::Error;
6
/// A fetched page: the HTML returned by the engine plus derived artifacts
/// (text, title, optional layout/JS/accessibility data, console output).
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// Page HTML as returned by the engine.
    pub html: String,
    /// Text content of the page (empty if the engine produced none).
    pub inner_text: String,
    /// Contents of the `<title>` element, if present and non-empty.
    pub title: Option<String>,
    /// Layout information as a JSON string, when the engine produced it.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Result of an executed JavaScript expression, when one was requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Console messages captured while the page ran.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree dump, when the engine produced one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    // PNG-encoded screenshot bytes; private and excluded from
    // serialization — exposed read-only via `screenshot_png()`.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}
31
impl Page {
    /// Converts the page to Markdown with no base URL, leaving relative
    /// links unresolved.
    ///
    /// # Errors
    /// Propagates any failure from the underlying text extraction.
    pub fn markdown(&self) -> crate::error::Result<String> {
        self.markdown_with_url("")
    }

    /// Converts the page to Markdown, passing `url` to the extractor as
    /// the page's base URL.
    ///
    /// # Errors
    /// Propagates any failure from the underlying text extraction.
    pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
        let input = crate::extract::ExtractInput::new(&self.html, url)
            .with_layout_json(self.layout_json.as_deref())
            .with_inner_text(Some(&self.inner_text));
        Ok(crate::extract::extract_text(&input)?)
    }

    /// Extracts structured JSON from the page with no base URL.
    ///
    /// # Errors
    /// Propagates any failure from the underlying JSON extraction.
    pub fn extract_json(&self) -> crate::error::Result<String> {
        self.extract_json_with_url("")
    }

    /// Extracts structured JSON from the page, passing `url` to the
    /// extractor as the page's base URL.
    ///
    /// # Errors
    /// Propagates any failure from the underlying JSON extraction.
    pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
        let input = crate::extract::ExtractInput::new(&self.html, url)
            .with_layout_json(self.layout_json.as_deref())
            .with_inner_text(Some(&self.inner_text));
        Ok(crate::extract::extract_json(&input)?)
    }

    /// Returns the PNG-encoded screenshot bytes, if a screenshot was taken.
    #[must_use]
    pub fn screenshot_png(&self) -> Option<&[u8]> {
        self.screenshot_png.as_deref()
    }

    /// Builds a public `Page` from the engine's internal page representation.
    pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
        // Pull the <title> text out of the HTML; empty titles become `None`.
        // NOTE(review): the text is not trimmed, so a whitespace-only
        // <title> yields `Some(" ")` — confirm that is intended.
        let title = {
            let doc = dom_query::Document::from(page.html.as_str());
            let t = doc.select("title").text().to_string();
            if t.is_empty() { None } else { Some(t) }
        };
        // Encode the raw screenshot image as PNG. An encoding failure
        // silently drops the screenshot (the `.ok()?` discards the error).
        let screenshot_png = page.screenshot.and_then(|img| {
            let mut buf = std::io::Cursor::new(Vec::new());
            img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
            Some(buf.into_inner())
        });
        Self {
            html: page.html,
            inner_text: page.inner_text.unwrap_or_default(),
            title,
            layout_json: page.layout_json,
            js_result: page.js_result,
            // Map bridge-level console levels onto the public enum 1:1.
            console_messages: page
                .console_messages
                .into_iter()
                .map(|m| ConsoleMessage {
                    level: match m.level {
                        crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
                        crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
                        crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
                        crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
                        crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
                        crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
                    },
                    message: m.message,
                })
                .collect(),
            screenshot_png,
            accessibility_tree: page.accessibility_tree,
        }
    }
}
102
/// A single console message captured while the page ran.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity of the message.
    pub level: ConsoleLevel,
    /// The message text.
    pub message: String,
}
112
/// Severity level of a captured console message.
///
/// Serializes as its lowercase name (e.g. `"warn"`), matching
/// [`ConsoleLevel::as_str`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    Log,
    Debug,
    Info,
    Warn,
    Error,
    Trace,
}
131
132impl ConsoleLevel {
133 #[must_use]
135 pub fn as_str(&self) -> &'static str {
136 match self {
137 Self::Log => "log",
138 Self::Debug => "debug",
139 Self::Info => "info",
140 Self::Warn => "warn",
141 Self::Error => "error",
142 Self::Trace => "trace",
143 }
144 }
145}
146
impl std::fmt::Display for ConsoleLevel {
    /// Writes the lowercase level name via `Formatter::pad`, so width,
    /// fill, and alignment flags (e.g. `{:>5}`) are honored.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.pad(self.as_str())
    }
}
152
/// What the engine should do with the page once it has loaded.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Return page content (the default).
    #[default]
    Content,
    /// Capture a screenshot; `full_page` selects full-page vs. viewport.
    Screenshot {
        full_page: bool,
    },
    /// Evaluate the given JavaScript expression and return its result.
    JavaScript(String),
}
162
/// Builder-style options for [`fetch`].
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    // Target URL; validated in `fetch`, not here.
    pub(crate) url: String,
    // Overall fetch timeout (clamped to >= 1 second when handed to the engine).
    pub(crate) timeout: Duration,
    // Settle delay forwarded to the engine as milliseconds.
    pub(crate) settle: Duration,
    pub(crate) mode: FetchMode,
    // Sanitized user agent (CR/LF/NUL replaced); `None` uses the engine default.
    pub(crate) user_agent: Option<String>,
}
178
179impl FetchOptions {
180 pub fn new(url: &str) -> Self {
182 Self {
183 url: url.into(),
184 timeout: Duration::from_secs(30),
185 settle: Duration::ZERO,
186 mode: FetchMode::Content,
187 user_agent: None,
188 }
189 }
190
191 pub fn screenshot(url: &str, full_page: bool) -> Self {
193 Self {
194 mode: FetchMode::Screenshot { full_page },
195 ..Self::new(url)
196 }
197 }
198
199 pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
201 Self {
202 mode: FetchMode::JavaScript(expression.into()),
203 ..Self::new(url)
204 }
205 }
206
207 pub fn timeout(mut self, timeout: Duration) -> Self {
209 self.timeout = timeout;
210 self
211 }
212
213 pub fn settle(mut self, settle: Duration) -> Self {
215 self.settle = settle;
216 self
217 }
218
219 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
221 self.user_agent = Some(sanitize_user_agent(ua.into()));
222 self
223 }
224}
225
/// Fetches a single page according to `opts`.
///
/// Validates the URL first (rejecting malformed URLs and disallowed
/// addresses), then — for plain content fetches — probes for a PDF and
/// extracts its text directly, bypassing the engine. Otherwise the
/// request is forwarded to the engine bridge.
///
/// # Errors
/// - [`Error::InvalidUrl`] / [`Error::AddressNotAllowed`] for rejected URLs.
/// - [`Error::Timeout`] when the engine reports a timeout.
/// - [`Error::Engine`] for any other engine failure.
#[allow(clippy::needless_pass_by_value)]
pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
    ensure_crypto_provider();

    crate::net::validate_url(&opts.url).map_err(|e| map_url_error(&opts.url, e))?;

    // PDF fast path: only meaningful for content mode; screenshot/JS
    // requests still need the engine.
    if matches!(opts.mode, FetchMode::Content)
        && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
    {
        let text = crate::extract::extract_pdf(&bytes);
        return Ok(Page {
            html: String::new(),
            inner_text: text,
            ..Page::default()
        });
    }

    let bridge_opts = crate::bridge::FetchOptions {
        url: &opts.url,
        // The bridge takes whole seconds; clamp so a sub-second timeout
        // does not truncate to zero.
        timeout_secs: opts.timeout.as_secs().max(1),
        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
        user_agent: opts.user_agent.as_deref(),
        mode: match opts.mode {
            FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
            FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
            FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
                expression: expr.clone(),
            },
        },
    };

    let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
        // NOTE(review): timeout detection by substring match on the error
        // text is brittle — a typed error from the bridge would be safer.
        let msg = format!("{e:#}");
        if msg.contains("timed out") {
            Error::Timeout {
                url: opts.url.clone(),
                timeout: opts.timeout,
            }
        } else {
            Error::Engine(msg)
        }
    })?;

    Ok(Page::from_servo(servo_page))
}
275
/// Builder-style options for [`crawl`] and [`crawl_each`].
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    // Seed URL; validated when the crawl starts.
    pub(crate) url: String,
    // Maximum number of pages to visit.
    pub(crate) limit: usize,
    // Maximum link depth from the seed.
    pub(crate) max_depth: usize,
    // Per-page timeout (clamped to >= 1 second for the engine).
    pub(crate) timeout: Duration,
    // Settle delay forwarded to the engine as milliseconds.
    pub(crate) settle: Duration,
    // Glob patterns; empty means "no include filter".
    pub(crate) include: Vec<String>,
    // Glob patterns; empty means "no exclude filter".
    pub(crate) exclude: Vec<String>,
    // Optional CSS selector forwarded to the crawler.
    pub(crate) selector: Option<String>,
    // JSON extraction flag forwarded to the crawler.
    pub(crate) json: bool,
    // Sanitized user agent; `None` uses the engine default.
    pub(crate) user_agent: Option<String>,
}
291
292impl CrawlOptions {
293 pub fn new(url: &str) -> Self {
295 Self {
296 url: url.into(),
297 limit: 50,
298 max_depth: 3,
299 timeout: Duration::from_secs(30),
300 settle: Duration::ZERO,
301 include: Vec::new(),
302 exclude: Vec::new(),
303 selector: None,
304 json: false,
305 user_agent: None,
306 }
307 }
308
309 pub fn limit(mut self, n: usize) -> Self {
311 self.limit = n;
312 self
313 }
314
315 pub fn max_depth(mut self, n: usize) -> Self {
317 self.max_depth = n;
318 self
319 }
320
321 pub fn timeout(mut self, timeout: Duration) -> Self {
323 self.timeout = timeout;
324 self
325 }
326
327 pub fn settle(mut self, settle: Duration) -> Self {
329 self.settle = settle;
330 self
331 }
332
333 pub fn include(mut self, patterns: &[&str]) -> Self {
335 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
336 self
337 }
338
339 pub fn exclude(mut self, patterns: &[&str]) -> Self {
341 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
342 self
343 }
344
345 pub fn json(mut self, json: bool) -> Self {
347 self.json = json;
348 self
349 }
350
351 pub fn selector(mut self, selector: impl Into<String>) -> Self {
353 self.selector = Some(selector.into());
354 self
355 }
356
357 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
359 self.user_agent = Some(sanitize_user_agent(ua.into()));
360 self
361 }
362}
363
/// Outcome of visiting one URL during a crawl.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CrawlResult {
    /// The URL that was visited.
    pub url: String,
    /// Link depth at which this URL was reached.
    pub depth: usize,
    /// Extracted page on success, or the per-page error on failure.
    pub outcome: Result<CrawlPage, CrawlError>,
}
375
/// Successfully extracted content for one crawled page.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Page title, if one was found.
    pub title: Option<String>,
    /// Extracted content (empty string if the crawler produced none).
    pub content: String,
    /// Number of links discovered on the page.
    pub links_found: usize,
}
386
/// Per-page crawl failure; a crawl continues past individual page errors.
#[derive(Debug, Clone)]
pub struct CrawlError {
    /// Human-readable description of the failure.
    pub message: String,
}
393
394impl std::fmt::Display for CrawlError {
395 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
396 f.write_str(&self.message)
397 }
398}
399
// `CrawlError` wraps no underlying error, so the default trait methods suffice.
impl std::error::Error for CrawlError {}
401
402impl serde::Serialize for CrawlResult {
403 fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
404 use serde::ser::SerializeMap;
405 match &self.outcome {
406 Ok(page) => {
407 let mut map = serializer.serialize_map(None)?;
408 map.serialize_entry("url", &self.url)?;
409 map.serialize_entry("depth", &self.depth)?;
410 map.serialize_entry("status", "ok")?;
411 if let Some(t) = &page.title {
412 map.serialize_entry("title", t)?;
413 }
414 map.serialize_entry("content", &page.content)?;
415 map.serialize_entry("links_found", &page.links_found)?;
416 map.end()
417 }
418 Err(e) => {
419 let mut map = serializer.serialize_map(None)?;
420 map.serialize_entry("url", &self.url)?;
421 map.serialize_entry("depth", &self.depth)?;
422 map.serialize_entry("status", "error")?;
423 map.serialize_entry("error", &e.message)?;
424 map.end()
425 }
426 }
427 }
428}
429
impl CrawlResult {
    /// Converts the crawler's internal per-page record into the public
    /// type, cloning out of the borrowed record.
    fn from_internal(r: &crate::crawl::CrawlPageResult) -> Self {
        let outcome = match r.status {
            crate::crawl::CrawlStatus::Ok => Ok(CrawlPage {
                title: r.title.clone(),
                // Missing content on a successful page becomes "".
                content: r.content.clone().unwrap_or_default(),
                links_found: r.links_found,
            }),
            crate::crawl::CrawlStatus::Error => Err(CrawlError {
                // Missing error text becomes "".
                message: r.error.clone().unwrap_or_default(),
            }),
        };
        Self {
            url: r.url.clone(),
            depth: r.depth,
            outcome,
        }
    }
}
449
/// Whether a crawled page succeeded or failed.
///
/// NOTE(review): this public enum appears to mirror the internal
/// `crate::crawl::CrawlStatus` (which `CrawlResult::from_internal` matches
/// on) and is not referenced elsewhere in this module — confirm the
/// duplicate export is intentional.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum CrawlStatus {
    Ok,
    Error,
}
460
/// Runs a crawl, invoking `on_page` for each page result as it is produced.
///
/// # Errors
/// Returns an error if the options are invalid (bad seed URL or glob
/// pattern) or if the crawl engine fails as a whole; individual page
/// failures are reported through the callback instead.
#[allow(clippy::needless_pass_by_value)]
pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
    ensure_crypto_provider();
    let internal_opts = build_crawl_options(&opts)?;
    // Block on the async crawler, converting each internal record to the
    // public type before handing it to the callback.
    crate::runtime::block_on(crate::crawl::run(internal_opts, |r| {
        on_page(&CrawlResult::from_internal(r));
    }))
    .map_err(|e| Error::Engine(e.to_string()))?;
    Ok(())
}
472
473#[allow(clippy::needless_pass_by_value)]
475pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
476 let mut results = Vec::new();
477 crawl_each(opts, |r| results.push(r.clone()))?;
478 Ok(results)
479}
480
481pub fn markdown(url: &str) -> crate::error::Result<String> {
483 fetch(FetchOptions::new(url))?.markdown_with_url(url)
484}
485
486pub fn extract_json(url: &str) -> crate::error::Result<String> {
488 fetch(FetchOptions::new(url))?.extract_json_with_url(url)
489}
490
491pub fn text(url: &str) -> crate::error::Result<String> {
493 Ok(fetch(FetchOptions::new(url))?.inner_text)
494}
495
496pub fn validate_url(url: &str) -> crate::error::Result<url::Url> {
498 crate::net::validate_url(url).map_err(|e| map_url_error(url, e))
499}
500
/// Installs the aws-lc-rs rustls crypto provider as the process default.
/// The result is deliberately ignored: `install_default` errors only when
/// a default provider is already installed, which is fine here.
fn ensure_crypto_provider() {
    let _ = rustls::crypto::aws_lc_rs::default_provider().install_default();
}
504
505pub(crate) fn sanitize_user_agent(ua: String) -> String {
507 if ua.bytes().any(|b| b == b'\r' || b == b'\n' || b == 0) {
508 ua.replace(['\r', '\n', '\0'], " ")
509 } else {
510 ua
511 }
512}
513
/// Translates a low-level URL validation error into the crate's public
/// error type, attaching the offending URL where it adds context.
fn map_url_error(url: &str, e: crate::net::UrlError) -> Error {
    match e {
        // Disallowed addresses report only the host, not the full URL.
        crate::net::UrlError::PrivateAddress(host) => Error::AddressNotAllowed(host),
        crate::net::UrlError::Invalid(reason) => Error::InvalidUrl {
            url: url.into(),
            reason,
        },
    }
}
523
/// Validates the public crawl options and converts them into the
/// crawler's internal options type.
///
/// # Errors
/// Fails if the seed URL is invalid or disallowed, or if a glob pattern
/// in `include`/`exclude` does not compile.
fn build_crawl_options(opts: &CrawlOptions) -> crate::error::Result<crate::crawl::CrawlOptions> {
    let seed = crate::net::validate_url(&opts.url).map_err(|e| map_url_error(&opts.url, e))?;
    // Empty pattern lists mean "no filter" rather than an empty globset.
    let include = if opts.include.is_empty() {
        None
    } else {
        Some(crate::crawl::build_globset(&opts.include).map_err(|e| Error::Engine(e.to_string()))?)
    };
    let exclude = if opts.exclude.is_empty() {
        None
    } else {
        Some(crate::crawl::build_globset(&opts.exclude).map_err(|e| Error::Engine(e.to_string()))?)
    };
    Ok(crate::crawl::CrawlOptions {
        seed,
        limit: opts.limit,
        max_depth: opts.max_depth,
        // The crawler takes whole seconds; clamp so sub-second timeouts
        // do not truncate to zero.
        timeout_secs: opts.timeout.as_secs().max(1),
        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
        include,
        exclude,
        selector: opts.selector.clone(),
        json: opts.json,
        user_agent: opts.user_agent.clone(),
    })
}
549
#[cfg(test)]
mod tests {
    //! Unit tests for the option builders, user-agent sanitization, the
    //! `Page` helpers, and `fetch`'s URL validation. None of these tests
    //! touch the network: the `fetch_rejects_*` cases fail during URL
    //! validation before any request is made.
    use super::*;

    // --- FetchOptions builder ---

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    // --- CrawlOptions builder ---

    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
    }

    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"]);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
    }

    // --- User-agent sanitization (header-injection defense) ---

    #[test]
    fn fetch_user_agent_set() {
        let opts = FetchOptions::new("https://example.com").user_agent("MyBot/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    #[test]
    fn fetch_user_agent_default_is_none() {
        let opts = FetchOptions::new("https://example.com");
        assert!(opts.user_agent.is_none());
    }

    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\r\nX-Evil: yes");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot X-Evil: yes"));
    }

    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\0/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot /1.0"));
    }

    #[test]
    fn fetch_user_agent_empty_string() {
        // Empty is allowed — sanitization does not reject, only cleans.
        let opts = FetchOptions::new("https://example.com").user_agent("");
        assert_eq!(opts.user_agent.as_deref(), Some(""));
    }

    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler /2.0"));
    }

    // --- Page conversion helpers ---

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    // --- fetch() URL validation (fails before any network I/O) ---

    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
}