1use std::time::Duration;
4
5use crate::error::Error;
6use crate::net::sanitize_user_agent;
7
/// Result of a single fetch, as returned by `fetch`.
///
/// The type is serializable with serde: optional fields that are `None` are
/// left out of the output and the screenshot bytes are never serialized
/// (use [`Page::screenshot_png`] to read them).
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// HTML of the page as reported by the engine. Left empty when `fetch`
    /// served the request through its PDF path.
    pub html: String,
    /// Text content reported by the engine (empty when the engine supplied
    /// none), or the text extracted from a PDF.
    pub inner_text: String,
    /// Text of the page's `<title>`, parsed from `html`; `None` when there
    /// is no title text.
    pub title: Option<String>,
    /// Layout data from the engine, if any; forwarded verbatim to the
    /// extractors. Presumably a JSON document, going by the name (TODO confirm).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// JavaScript result reported by the engine, if any. Presumably only set
    /// for JavaScript-mode fetches (TODO confirm against the bridge).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Console messages captured by the engine, in the order the engine
    /// reported them.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree reported by the engine, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    /// Output of the extraction schema applied to `html`; stays `None` when
    /// no schema was supplied to `fetch`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extracted: Option<serde_json::Value>,
    /// PNG-encoded screenshot, if one was captured and encoded successfully.
    /// Private and skipped during serialization.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}
35
36impl Page {
37 pub fn markdown(&self) -> crate::error::Result<String> {
39 self.markdown_with_url("")
40 }
41
42 pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
44 let input = crate::extract::ExtractInput::new(&self.html, url)
45 .with_layout_json(self.layout_json.as_deref())
46 .with_inner_text(Some(&self.inner_text));
47 Ok(crate::extract::extract_text(&input)?)
48 }
49
50 pub fn extract_json(&self) -> crate::error::Result<String> {
52 self.extract_json_with_url("")
53 }
54
55 pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
57 let input = crate::extract::ExtractInput::new(&self.html, url)
58 .with_layout_json(self.layout_json.as_deref())
59 .with_inner_text(Some(&self.inner_text));
60 Ok(crate::extract::extract_json(&input)?)
61 }
62
63 pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
65 let input = crate::extract::ExtractInput::new(&self.html, url)
66 .with_layout_json(self.layout_json.as_deref())
67 .with_inner_text(Some(&self.inner_text))
68 .with_selector(Some(selector));
69 Ok(crate::extract::extract_text(&input)?)
70 }
71
72 pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
74 let input = crate::extract::ExtractInput::new(&self.html, url)
75 .with_layout_json(self.layout_json.as_deref())
76 .with_inner_text(Some(&self.inner_text))
77 .with_selector(Some(selector));
78 Ok(crate::extract::extract_json(&input)?)
79 }
80
81 #[must_use]
83 pub fn screenshot_png(&self) -> Option<&[u8]> {
84 self.screenshot_png.as_deref()
85 }
86
87 pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
88 let title = {
89 let doc = dom_query::Document::from(page.html.as_str());
90 let t = doc.select("title").text().to_string();
91 if t.is_empty() { None } else { Some(t) }
92 };
93 let screenshot_png = page.screenshot.and_then(|img| {
94 let mut buf = std::io::Cursor::new(Vec::new());
95 img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
96 Some(buf.into_inner())
97 });
98 Self {
99 html: page.html,
100 inner_text: page.inner_text.unwrap_or_default(),
101 title,
102 layout_json: page.layout_json,
103 js_result: page.js_result,
104 console_messages: page
105 .console_messages
106 .into_iter()
107 .map(|m| ConsoleMessage {
108 level: match m.level {
109 crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
110 crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
111 crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
112 crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
113 crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
114 crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
115 },
116 message: m.message,
117 })
118 .collect(),
119 screenshot_png,
120 accessibility_tree: page.accessibility_tree,
121 extracted: None,
122 }
123 }
124}
125
/// One console message captured while the page was being processed.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity the message was logged at.
    pub level: ConsoleLevel,
    /// Message text as reported by the engine.
    pub message: String,
}
135
/// Severity of a [`ConsoleMessage`].
///
/// Serialized as the lowercase variant name (`"log"`, `"debug"`, ...).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    /// Serialized as `"log"`.
    Log,
    /// Serialized as `"debug"`.
    Debug,
    /// Serialized as `"info"`.
    Info,
    /// Serialized as `"warn"`.
    Warn,
    /// Serialized as `"error"`.
    Error,
    /// Serialized as `"trace"`.
    Trace,
}
154
155impl ConsoleLevel {
156 #[must_use]
158 pub fn as_str(&self) -> &'static str {
159 match self {
160 Self::Log => "log",
161 Self::Debug => "debug",
162 Self::Info => "info",
163 Self::Warn => "warn",
164 Self::Error => "error",
165 Self::Trace => "trace",
166 }
167 }
168}
169
170impl std::fmt::Display for ConsoleLevel {
171 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
172 f.pad(self.as_str())
173 }
174}
175
/// What a fetch should produce. Crate-internal; callers pick a mode through
/// the `FetchOptions` constructors.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Plain content fetch (the default). Only this mode goes through the
    /// PDF probe in `fetch`.
    #[default]
    Content,
    /// Capture a screenshot.
    Screenshot {
        /// Forwarded to the engine unchanged; presumably selects the whole
        /// page rather than just the viewport (TODO confirm).
        full_page: bool,
    },
    /// Hand the contained JavaScript expression to the engine.
    JavaScript(String),
}
185
/// Builder-style description of one fetch: target URL, timing, mode and the
/// optional user agent and extraction schema.
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    /// URL to fetch; validated by `fetch` before anything else happens.
    pub(crate) url: String,
    /// Overall timeout. `fetch` passes it on as whole seconds, never less
    /// than one.
    pub(crate) timeout: Duration,
    /// Settle time, passed to the engine in milliseconds. Presumably extra
    /// time to let the page settle after loading (TODO confirm).
    pub(crate) settle: Duration,
    /// What the fetch should produce.
    pub(crate) mode: FetchMode,
    /// User agent override, already sanitized; `None` leaves the choice to
    /// the engine.
    pub(crate) user_agent: Option<String>,
    /// Schema applied to the fetched HTML to fill `Page::extracted`.
    pub(crate) extract_schema: Option<crate::schema::ExtractSchema>,
}
197
198impl FetchOptions {
199 pub fn new(url: &str) -> Self {
201 Self {
202 url: url.into(),
203 timeout: Duration::from_secs(30),
204 settle: Duration::ZERO,
205 mode: FetchMode::Content,
206 user_agent: None,
207 extract_schema: None,
208 }
209 }
210
211 pub fn screenshot(url: &str, full_page: bool) -> Self {
213 Self {
214 mode: FetchMode::Screenshot { full_page },
215 ..Self::new(url)
216 }
217 }
218
219 pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
221 Self {
222 mode: FetchMode::JavaScript(expression.into()),
223 ..Self::new(url)
224 }
225 }
226
227 pub fn timeout(mut self, timeout: Duration) -> Self {
229 self.timeout = timeout;
230 self
231 }
232
233 pub fn settle(mut self, settle: Duration) -> Self {
235 self.settle = settle;
236 self
237 }
238
239 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
241 self.user_agent = Some(sanitize_user_agent(ua.into()));
242 self
243 }
244
245 pub fn schema(mut self, schema: crate::schema::ExtractSchema) -> Self {
247 self.extract_schema = Some(schema);
248 self
249 }
250}
251
252#[allow(clippy::needless_pass_by_value)]
254pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
255 crate::net::ensure_crypto_provider();
256
257 crate::net::validate_url(&opts.url)?;
258
259 if matches!(opts.mode, FetchMode::Content)
260 && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
261 {
262 let text = crate::extract::extract_pdf(&bytes);
263 return Ok(Page {
264 html: String::new(),
265 inner_text: text,
266 ..Page::default()
267 });
268 }
269
270 let bridge_opts = crate::bridge::FetchOptions {
271 url: &opts.url,
272 timeout_secs: opts.timeout.as_secs().max(1),
273 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
274 user_agent: opts.user_agent.as_deref(),
275 mode: match opts.mode {
276 FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
277 FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
278 FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
279 expression: expr.clone(),
280 },
281 },
282 };
283
284 let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
285 let msg = format!("{e:#}");
286 if msg.contains("timed out") {
287 Error::Timeout {
288 url: opts.url.clone(),
289 timeout: opts.timeout,
290 }
291 } else {
292 Error::Engine(msg)
293 }
294 })?;
295
296 let mut page = Page::from_servo(servo_page);
297 if let Some(schema) = opts.extract_schema.as_ref() {
298 page.extracted = Some(schema.extract_from(&page.html));
299 }
300 Ok(page)
301}
302
303pub fn markdown(url: &str) -> crate::error::Result<String> {
305 fetch(FetchOptions::new(url))?.markdown_with_url(url)
306}
307
308pub fn extract_json(url: &str) -> crate::error::Result<String> {
310 fetch(FetchOptions::new(url))?.extract_json_with_url(url)
311}
312
313pub fn text(url: &str) -> crate::error::Result<String> {
315 Ok(fetch(FetchOptions::new(url))?.inner_text)
316}
317
// Unit tests for the option builder, the `Page` helpers, the URL checks that
// `fetch` performs up front, and the `ServoPage` -> `Page` conversion.
#[cfg(test)]
mod tests {
    use super::*;

    // `new` yields a content-mode fetch with a 30 s timeout and no settle time.
    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    // The screenshot constructor records the `full_page` flag in the mode.
    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    // The JavaScript constructor stores the expression verbatim.
    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    // Builder methods can be chained and each one overrides its own field.
    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    // A well-formed user agent passes through sanitization unchanged.
    #[test]
    fn fetch_user_agent_set() {
        let opts = FetchOptions::new("https://example.com").user_agent("MyBot/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    // No user agent override unless one is requested.
    #[test]
    fn fetch_user_agent_default_is_none() {
        let opts = FetchOptions::new("https://example.com");
        assert!(opts.user_agent.is_none());
    }

    // CR/LF must not survive, otherwise a caller could smuggle extra headers.
    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\r\nX-Evil: yes");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot X-Evil: yes"));
    }

    // NUL bytes are replaced as well.
    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\0/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot /1.0"));
    }

    // An empty user agent is kept as `Some("")`, not turned into `None`.
    #[test]
    fn fetch_user_agent_empty_string() {
        let opts = FetchOptions::new("https://example.com").user_agent("");
        assert_eq!(opts.user_agent.as_deref(), Some(""));
    }

    // Text extraction works on a hand-built page without any fetch.
    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    // The JSON extraction must parse as JSON.
    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    // A default page carries no screenshot.
    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    // Content outside the selected element must not leak into the output.
    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = Page {
            html: "<html><body><article>keep</article><aside>drop</aside></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(md.contains("keep"));
        assert!(!md.contains("drop"));
    }

    // The URL given to the method shows up in the JSON next to the scoped text.
    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = Page {
            html: "<html><body><article>scoped</article></body></html>".into(),
            ..Page::default()
        };
        let json = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
        assert_eq!(parsed["url"].as_str(), Some("https://example.com/page"));
        assert!(parsed["text_content"].as_str().unwrap().contains("scoped"));
    }

    // A valid selector that matches nothing yields empty output, not an error.
    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = Page {
            html: "<html><body><article>x</article></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(md.is_empty());
    }

    // A malformed selector is reported with a recognisable message.
    #[test]
    fn page_markdown_with_invalid_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        let err = page.markdown_with_selector("", "###invalid[[[").unwrap_err();
        assert!(err.to_string().contains("invalid CSS selector"));
    }

    // An empty selector is an error rather than "select everything".
    #[test]
    fn page_markdown_with_empty_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        assert!(page.markdown_with_selector("", "").is_err());
    }

    // Unparsable input is reported as `Error::InvalidUrl`.
    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    // Loopback targets must fail; only the presence of an error is checked.
    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    // Local files must not be readable through `fetch`.
    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }

    // Tests for `Page::from_servo`, driven by hand-built `ServoPage` values.
    mod page_from_servo {
        use crate::bridge;
        use crate::fetch::{ConsoleLevel, Page};

        // Solid red image of the requested size, used as a fake screenshot.
        fn synthetic_image(w: u32, h: u32) -> image::RgbaImage {
            image::RgbaImage::from_pixel(w, h, image::Rgba([255, 0, 0, 255]))
        }

        // Baseline input: the bridge type's `Default` value.
        fn empty_servo_page() -> bridge::ServoPage {
            bridge::ServoPage::default()
        }

        // The title comes from the `<title>` element of the HTML.
        #[test]
        fn extracts_title_from_html() {
            let mut sp = empty_servo_page();
            sp.html = "<html><head><title>Hello World</title></head></html>".into();
            let page = Page::from_servo(sp);
            assert_eq!(page.title.as_deref(), Some("Hello World"));
        }

        // No `<title>` element means no title.
        #[test]
        fn title_is_none_when_tag_missing() {
            let mut sp = empty_servo_page();
            sp.html = "<html><body>no title here</body></html>".into();
            let page = Page::from_servo(sp);
            assert!(page.title.is_none());
        }

        // An empty `<title>` is treated like a missing one.
        #[test]
        fn title_is_none_when_tag_empty() {
            let mut sp = empty_servo_page();
            sp.html = "<html><head><title></title></head></html>".into();
            let page = Page::from_servo(sp);
            assert!(page.title.is_none());
        }

        // Empty HTML must not produce a title either.
        #[test]
        fn title_is_none_for_empty_html() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.title.is_none());
        }

        // A missing inner text becomes an empty string on the public type.
        #[test]
        fn inner_text_none_becomes_empty_string() {
            let sp = empty_servo_page();
            assert!(sp.inner_text.is_none());
            let page = Page::from_servo(sp);
            assert_eq!(page.inner_text, "");
        }

        // The stored screenshot must start with the PNG signature.
        #[test]
        fn screenshot_is_encoded_as_png() {
            let mut sp = empty_servo_page();
            sp.screenshot = Some(synthetic_image(8, 8));
            let page = Page::from_servo(sp);
            let bytes = page.screenshot_png().expect("screenshot encoded");
            assert_eq!(&bytes[..8], b"\x89PNG\r\n\x1a\n", "PNG magic bytes");
        }

        // No console output on the input means none on the page.
        #[test]
        fn console_messages_empty_by_default() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.console_messages.is_empty());
        }

        // Every bridge level maps to the public level of the same name.
        #[test]
        fn console_messages_preserve_all_six_levels() {
            let cases = [
                (bridge::ConsoleLevel::Log, ConsoleLevel::Log),
                (bridge::ConsoleLevel::Debug, ConsoleLevel::Debug),
                (bridge::ConsoleLevel::Info, ConsoleLevel::Info),
                (bridge::ConsoleLevel::Warn, ConsoleLevel::Warn),
                (bridge::ConsoleLevel::Error, ConsoleLevel::Error),
                (bridge::ConsoleLevel::Trace, ConsoleLevel::Trace),
            ];
            for (src, expected) in cases {
                let mut sp = empty_servo_page();
                sp.console_messages = vec![bridge::ConsoleMessage {
                    level: src,
                    message: "msg".into(),
                }];
                let page = Page::from_servo(sp);
                assert_eq!(
                    page.console_messages.len(),
                    1,
                    "console message lost for source level {src:?}",
                );
                assert_eq!(
                    page.console_messages[0].level, expected,
                    "level mapping wrong for source {src:?}",
                );
            }
        }

        // Messages keep their original order regardless of level.
        #[test]
        fn console_messages_preserve_ordering_across_levels() {
            let mut sp = empty_servo_page();
            sp.console_messages = vec![
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Info,
                    message: "first".into(),
                },
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Error,
                    message: "second".into(),
                },
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Warn,
                    message: "third".into(),
                },
            ];
            let page = Page::from_servo(sp);
            assert_eq!(page.console_messages.len(), 3);
            assert_eq!(page.console_messages[0].message, "first");
            assert_eq!(page.console_messages[1].message, "second");
            assert_eq!(page.console_messages[2].message, "third");
            assert_eq!(page.console_messages[0].level, ConsoleLevel::Info);
            assert_eq!(page.console_messages[1].level, ConsoleLevel::Error);
            assert_eq!(page.console_messages[2].level, ConsoleLevel::Warn);
        }

        // The conversion itself never fills in `extracted`.
        #[test]
        fn extracted_starts_as_none_until_schema_applied() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.extracted.is_none());
        }

        // Populate every bridge field and check each one on the result.
        #[test]
        fn full_round_trip_preserves_every_field() {
            let sp = bridge::ServoPage {
                html: "<html><head><title>T</title></head><body>B</body></html>".into(),
                inner_text: Some("B".into()),
                layout_json: Some("[]".into()),
                screenshot: Some(synthetic_image(2, 2)),
                js_result: Some("42".into()),
                accessibility_tree: Some("{}".into()),
                console_messages: vec![bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Log,
                    message: "x".into(),
                }],
            };
            let page = Page::from_servo(sp);
            assert_eq!(page.html, "<html><head><title>T</title></head><body>B</body></html>");
            assert_eq!(page.inner_text, "B");
            assert_eq!(page.title.as_deref(), Some("T"));
            assert_eq!(page.layout_json.as_deref(), Some("[]"));
            assert_eq!(page.js_result.as_deref(), Some("42"));
            assert_eq!(page.accessibility_tree.as_deref(), Some("{}"));
            assert_eq!(page.console_messages.len(), 1);
            assert!(page.screenshot_png().is_some());
            assert!(page.extracted.is_none());
        }
    }
}