1use std::collections::HashMap;
4use std::sync::Arc;
5use std::time::Duration;
6
7use servo::accesskit::{Node, NodeId};
8
9use crate::error::Error;
10use crate::net::sanitize_user_agent;
11
/// The outcome of fetching a single URL.
///
/// Instances are produced by `fetch`, either from the engine's page (see
/// `Page::from_servo`) or, for PDF responses, from the extracted PDF text.
/// Private fields are excluded from serialization and only feed the
/// extraction helpers on `impl Page`.
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// HTML of the page as reported by the engine. Empty for PDF results.
    pub html: String,
    /// Text of the page as reported by the engine (empty when the engine
    /// reported none). For PDF results this holds the extracted PDF text.
    pub inner_text: String,
    /// Text of the document's `<title>`, or `None` when it is missing or empty.
    pub title: Option<String>,
    /// Layout data passed through from the engine; forwarded to the extractor.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Visibility data passed through from the engine; forwarded to the
    /// extractor and never serialized.
    #[serde(skip)]
    visibility_json: Option<String>,
    /// Result string reported by the engine for a JavaScript fetch
    /// (presumably the evaluated expression — confirm against the bridge).
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Console messages reported by the engine, in the order received.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree string passed through from the engine.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    /// Schema extraction result; `fetch` fills this in only when the options
    /// carry a schema. Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extracted: Option<serde_json::Value>,
    /// PNG-encoded screenshot bytes; exposed through `Page::screenshot_png`.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
    /// Accessibility nodes keyed by id, shared behind an `Arc` so cloning a
    /// `Page` does not copy the map. Forwarded to the extractor.
    #[serde(skip)]
    a11y: Option<Arc<HashMap<NodeId, Node>>>,
    /// Visibility policy handed to the extractor; `fetch` sets it from the
    /// caller's options.
    #[serde(skip)]
    visibility_policy: crate::visibility::VisibilityPolicy,
}
49
50impl Page {
51 pub fn markdown(&self) -> crate::error::Result<String> {
53 self.markdown_with_url("")
54 }
55
56 pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
58 Ok(crate::extract::extract_text(&self.extract_input(url, None))?)
59 }
60
61 pub fn extract_json(&self) -> crate::error::Result<String> {
63 self.extract_json_with_url("")
64 }
65
66 pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
68 Ok(crate::extract::extract_json(&self.extract_input(url, None))?)
69 }
70
71 pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
73 Ok(crate::extract::extract_text(&self.extract_input(url, Some(selector)))?)
74 }
75
76 pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
78 Ok(crate::extract::extract_json(&self.extract_input(url, Some(selector)))?)
79 }
80
81 #[must_use]
83 pub fn screenshot_png(&self) -> Option<&[u8]> {
84 self.screenshot_png.as_deref()
85 }
86
87 fn extract_input<'a>(&'a self, url: &'a str, selector: Option<&'a str>) -> crate::extract::ExtractInput<'a> {
88 crate::extract::ExtractInput::new(&self.html, url)
89 .with_layout_json(self.layout_json.as_deref())
90 .with_visibility_json(self.visibility_json.as_deref())
91 .with_a11y(self.a11y.as_deref())
92 .with_inner_text(Some(&self.inner_text))
93 .with_selector(selector)
94 .with_visibility(self.visibility_policy)
95 }
96
97 pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
98 let title = {
99 let doc = dom_query::Document::from(page.html.as_str());
100 let t = doc.select("title").text().to_string();
101 if t.is_empty() { None } else { Some(t) }
102 };
103 let screenshot_png = page.screenshot.and_then(|img| {
104 let mut buf = std::io::Cursor::new(Vec::new());
105 img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
106 Some(buf.into_inner())
107 });
108 Self {
109 html: page.html,
110 inner_text: page.inner_text.unwrap_or_default(),
111 title,
112 layout_json: page.layout_json,
113 visibility_json: page.visibility_json,
114 js_result: page.js_result,
115 console_messages: page
116 .console_messages
117 .into_iter()
118 .map(|m| ConsoleMessage {
119 level: match m.level {
120 crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
121 crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
122 crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
123 crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
124 crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
125 crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
126 },
127 message: m.message,
128 })
129 .collect(),
130 screenshot_png,
131 accessibility_tree: page.accessibility_tree,
132 a11y: page.a11y.map(Arc::new),
133 extracted: None,
134 visibility_policy: crate::visibility::VisibilityPolicy::default(),
135 }
136 }
137}
138
/// A single console message reported by the engine while loading a page.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Level the message was reported at.
    pub level: ConsoleLevel,
    /// Message text as reported by the engine.
    pub message: String,
}
148
/// Level of a [`ConsoleMessage`].
///
/// Serialized as the lowercase variant name (e.g. `"warn"`), matching
/// `ConsoleLevel::as_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    /// Serialized as `"log"`.
    Log,
    /// Serialized as `"debug"`.
    Debug,
    /// Serialized as `"info"`.
    Info,
    /// Serialized as `"warn"`.
    Warn,
    /// Serialized as `"error"`.
    Error,
    /// Serialized as `"trace"`.
    Trace,
}
167
168impl ConsoleLevel {
169 #[must_use]
171 pub fn as_str(&self) -> &'static str {
172 match self {
173 Self::Log => "log",
174 Self::Debug => "debug",
175 Self::Info => "info",
176 Self::Warn => "warn",
177 Self::Error => "error",
178 Self::Trace => "trace",
179 }
180 }
181}
182
183impl std::fmt::Display for ConsoleLevel {
184 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
185 f.pad(self.as_str())
186 }
187}
188
/// What a fetch should produce; translated into the bridge's fetch mode by
/// `fetch`.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Plain content fetch (the default). Only this mode goes through the
    /// PDF probe in `fetch`.
    #[default]
    Content,
    /// Screenshot capture.
    Screenshot {
        /// Forwarded unchanged to the bridge (presumably captures the whole
        /// page rather than just the viewport — confirm against the bridge).
        full_page: bool,
    },
    /// Evaluate the contained JavaScript expression; forwarded to the bridge
    /// as its `ExecuteJs` mode.
    JavaScript(String),
}
198
/// Builder-style options for [`fetch`].
///
/// Start from `FetchOptions::new`, `FetchOptions::screenshot` or
/// `FetchOptions::javascript`, then chain the setters.
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    /// URL to fetch; validated by `fetch` before any request is made.
    pub(crate) url: String,
    /// Overall timeout. `fetch` passes it on in whole seconds, minimum one.
    pub(crate) timeout: Duration,
    /// Settle time handed to the bridge in milliseconds (presumably extra wait
    /// after load — confirm against the bridge).
    pub(crate) settle: Duration,
    /// What the fetch should produce.
    pub(crate) mode: FetchMode,
    /// Optional user agent override, already sanitized by the setter.
    pub(crate) user_agent: Option<String>,
    /// Optional schema; when set, `fetch` stores its extraction result in
    /// `Page::extracted`.
    pub(crate) extract_schema: Option<crate::schema::ExtractSchema>,
    /// Visibility policy copied onto the resulting `Page`.
    pub(crate) visibility: crate::visibility::VisibilityPolicy,
}
211
212impl FetchOptions {
213 pub fn new(url: &str) -> Self {
215 Self {
216 url: url.into(),
217 timeout: Duration::from_secs(30),
218 settle: Duration::ZERO,
219 mode: FetchMode::Content,
220 user_agent: None,
221 extract_schema: None,
222 visibility: crate::visibility::VisibilityPolicy::default(),
223 }
224 }
225
226 pub fn screenshot(url: &str, full_page: bool) -> Self {
228 Self {
229 mode: FetchMode::Screenshot { full_page },
230 ..Self::new(url)
231 }
232 }
233
234 pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
236 Self {
237 mode: FetchMode::JavaScript(expression.into()),
238 ..Self::new(url)
239 }
240 }
241
242 pub fn timeout(mut self, timeout: Duration) -> Self {
244 self.timeout = timeout;
245 self
246 }
247
248 pub fn settle(mut self, settle: Duration) -> Self {
250 self.settle = settle;
251 self
252 }
253
254 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
256 self.user_agent = Some(sanitize_user_agent(ua.into()));
257 self
258 }
259
260 pub fn schema(mut self, schema: crate::schema::ExtractSchema) -> Self {
262 self.extract_schema = Some(schema);
263 self
264 }
265
266 pub fn visibility(mut self, policy: crate::visibility::VisibilityPolicy) -> Self {
268 self.visibility = policy;
269 self
270 }
271}
272
273#[allow(clippy::needless_pass_by_value)]
275pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
276 crate::net::ensure_crypto_provider();
277
278 crate::net::validate_url(&opts.url)?;
279
280 if matches!(opts.mode, FetchMode::Content)
281 && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
282 {
283 let text = crate::extract::extract_pdf(&bytes);
284 return Ok(Page {
285 html: String::new(),
286 inner_text: text,
287 ..Page::default()
288 });
289 }
290
291 let bridge_opts = crate::bridge::FetchOptions {
292 url: &opts.url,
293 timeout_secs: opts.timeout.as_secs().max(1),
294 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
295 user_agent: opts.user_agent.as_deref(),
296 mode: match opts.mode {
297 FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
298 FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
299 FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
300 expression: expr.clone(),
301 },
302 },
303 };
304
305 let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
306 let msg = format!("{e:#}");
307 if msg.contains("timed out") {
308 Error::Timeout {
309 url: opts.url.clone(),
310 timeout: opts.timeout,
311 }
312 } else {
313 Error::Engine(msg)
314 }
315 })?;
316
317 let mut page = Page::from_servo(servo_page);
318 page.visibility_policy = opts.visibility;
319 if let Some(schema) = opts.extract_schema.as_ref() {
320 page.extracted = Some(schema.extract_from(&page.html));
321 }
322 Ok(page)
323}
324
325pub fn markdown(url: &str) -> crate::error::Result<String> {
327 fetch(FetchOptions::new(url))?.markdown_with_url(url)
328}
329
330pub fn extract_json(url: &str) -> crate::error::Result<String> {
332 fetch(FetchOptions::new(url))?.extract_json_with_url(url)
333}
334
335pub fn text(url: &str) -> crate::error::Result<String> {
337 Ok(fetch(FetchOptions::new(url))?.inner_text)
338}
339
/// Unit tests for the option builder, the `Page` extraction helpers, URL
/// rejection in `fetch`, and the bridge-to-`Page` conversion.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
        assert!(opts.user_agent.is_none());
        assert!(opts.extract_schema.is_none());
    }

    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    #[test]
    fn fetch_user_agent_set() {
        let opts = FetchOptions::new("https://example.com").user_agent("MyBot/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    #[test]
    fn fetch_user_agent_default_is_none() {
        let opts = FetchOptions::new("https://example.com");
        assert!(opts.user_agent.is_none());
    }

    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\r\nX-Evil: yes");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot X-Evil: yes"));
    }

    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\0/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot /1.0"));
    }

    #[test]
    fn fetch_user_agent_empty_string() {
        let opts = FetchOptions::new("https://example.com").user_agent("");
        assert_eq!(opts.user_agent.as_deref(), Some(""));
    }

    #[test]
    fn console_level_display_matches_as_str() {
        let cases = [
            (ConsoleLevel::Log, "log"),
            (ConsoleLevel::Debug, "debug"),
            (ConsoleLevel::Info, "info"),
            (ConsoleLevel::Warn, "warn"),
            (ConsoleLevel::Error, "error"),
            (ConsoleLevel::Trace, "trace"),
        ];
        for (level, expected) in cases {
            assert_eq!(level.as_str(), expected);
            assert_eq!(level.to_string(), expected);
        }
        // `Display` must honour width/alignment flags.
        assert_eq!(format!("{:>6}", ConsoleLevel::Log), "   log");
    }

    #[test]
    fn console_level_serializes_lowercase() {
        let json = serde_json::to_string(&ConsoleLevel::Warn).expect("serializable");
        assert_eq!(json, "\"warn\"");
    }

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = Page {
            html: "<html><body><article>keep</article><aside>drop</aside></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(md.contains("keep"));
        assert!(!md.contains("drop"));
    }

    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = Page {
            html: "<html><body><article>scoped</article></body></html>".into(),
            ..Page::default()
        };
        let json = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
        assert_eq!(parsed["url"].as_str(), Some("https://example.com/page"));
        assert!(parsed["text_content"].as_str().unwrap().contains("scoped"));
    }

    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = Page {
            html: "<html><body><article>x</article></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(md.is_empty());
    }

    #[test]
    fn page_markdown_with_invalid_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        let err = page.markdown_with_selector("", "###invalid[[[").unwrap_err();
        assert!(err.to_string().contains("invalid CSS selector"));
    }

    #[test]
    fn page_markdown_with_empty_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        assert!(page.markdown_with_selector("", "").is_err());
    }

    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }

    /// Tests for `Page::from_servo`, driven by synthetic bridge pages.
    mod page_from_servo {
        use crate::bridge;
        use crate::fetch::{ConsoleLevel, Page};

        /// Solid red image of the requested size.
        fn synthetic_image(w: u32, h: u32) -> image::RgbaImage {
            image::RgbaImage::from_pixel(w, h, image::Rgba([255, 0, 0, 255]))
        }

        fn empty_servo_page() -> bridge::ServoPage {
            bridge::ServoPage::default()
        }

        #[test]
        fn extracts_title_from_html() {
            let mut sp = empty_servo_page();
            sp.html = "<html><head><title>Hello World</title></head></html>".into();
            let page = Page::from_servo(sp);
            assert_eq!(page.title.as_deref(), Some("Hello World"));
        }

        #[test]
        fn title_is_none_when_tag_missing() {
            let mut sp = empty_servo_page();
            sp.html = "<html><body>no title here</body></html>".into();
            let page = Page::from_servo(sp);
            assert!(page.title.is_none());
        }

        #[test]
        fn title_is_none_when_tag_empty() {
            let mut sp = empty_servo_page();
            sp.html = "<html><head><title></title></head></html>".into();
            let page = Page::from_servo(sp);
            assert!(page.title.is_none());
        }

        #[test]
        fn title_is_none_for_empty_html() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.title.is_none());
        }

        #[test]
        fn inner_text_none_becomes_empty_string() {
            let sp = empty_servo_page();
            assert!(sp.inner_text.is_none());
            let page = Page::from_servo(sp);
            assert_eq!(page.inner_text, "");
        }

        #[test]
        fn screenshot_is_encoded_as_png() {
            let mut sp = empty_servo_page();
            sp.screenshot = Some(synthetic_image(8, 8));
            let page = Page::from_servo(sp);
            let bytes = page.screenshot_png().expect("screenshot encoded");
            assert_eq!(&bytes[..8], b"\x89PNG\r\n\x1a\n", "PNG magic bytes");
        }

        #[test]
        fn console_messages_empty_by_default() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.console_messages.is_empty());
        }

        #[test]
        fn console_messages_preserve_all_six_levels() {
            let cases = [
                (bridge::ConsoleLevel::Log, ConsoleLevel::Log),
                (bridge::ConsoleLevel::Debug, ConsoleLevel::Debug),
                (bridge::ConsoleLevel::Info, ConsoleLevel::Info),
                (bridge::ConsoleLevel::Warn, ConsoleLevel::Warn),
                (bridge::ConsoleLevel::Error, ConsoleLevel::Error),
                (bridge::ConsoleLevel::Trace, ConsoleLevel::Trace),
            ];
            for (src, expected) in cases {
                let mut sp = empty_servo_page();
                sp.console_messages = vec![bridge::ConsoleMessage {
                    level: src,
                    message: "msg".into(),
                }];
                let page = Page::from_servo(sp);
                assert_eq!(
                    page.console_messages.len(),
                    1,
                    "console message lost for source level {src:?}",
                );
                assert_eq!(
                    page.console_messages[0].level, expected,
                    "level mapping wrong for source {src:?}",
                );
            }
        }

        #[test]
        fn console_messages_preserve_ordering_across_levels() {
            let mut sp = empty_servo_page();
            sp.console_messages = vec![
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Info,
                    message: "first".into(),
                },
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Error,
                    message: "second".into(),
                },
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Warn,
                    message: "third".into(),
                },
            ];
            let page = Page::from_servo(sp);
            assert_eq!(page.console_messages.len(), 3);
            assert_eq!(page.console_messages[0].message, "first");
            assert_eq!(page.console_messages[1].message, "second");
            assert_eq!(page.console_messages[2].message, "third");
            assert_eq!(page.console_messages[0].level, ConsoleLevel::Info);
            assert_eq!(page.console_messages[1].level, ConsoleLevel::Error);
            assert_eq!(page.console_messages[2].level, ConsoleLevel::Warn);
        }

        #[test]
        fn extracted_starts_as_none_until_schema_applied() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.extracted.is_none());
        }

        #[test]
        fn full_round_trip_preserves_every_field() {
            let sp = bridge::ServoPage {
                html: "<html><head><title>T</title></head><body>B</body></html>".into(),
                inner_text: Some("B".into()),
                layout_json: Some("[]".into()),
                visibility_json: Some("[]".into()),
                screenshot: Some(synthetic_image(2, 2)),
                js_result: Some("42".into()),
                accessibility_tree: Some("{}".into()),
                a11y: None,
                console_messages: vec![bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Log,
                    message: "x".into(),
                }],
            };
            let page = Page::from_servo(sp);
            assert_eq!(page.html, "<html><head><title>T</title></head><body>B</body></html>");
            assert_eq!(page.inner_text, "B");
            assert_eq!(page.title.as_deref(), Some("T"));
            assert_eq!(page.layout_json.as_deref(), Some("[]"));
            // Private fields are reachable from this child module.
            assert_eq!(page.visibility_json.as_deref(), Some("[]"));
            assert_eq!(page.js_result.as_deref(), Some("42"));
            assert_eq!(page.accessibility_tree.as_deref(), Some("{}"));
            assert!(page.a11y.is_none());
            assert_eq!(page.console_messages.len(), 1);
            assert_eq!(page.console_messages[0].level, ConsoleLevel::Log);
            assert_eq!(page.console_messages[0].message, "x");
            assert!(page.screenshot_png().is_some());
            assert!(page.extracted.is_none());
        }
    }
}