Skip to main content

servo_fetch/
fetch.rs

1//! Single-page fetching and rendered content extraction.
2
3use std::collections::HashMap;
4use std::fmt;
5use std::sync::Arc;
6use std::time::Duration;
7
8use serde_json::Value;
9use servo::accesskit::{Node, NodeId};
10
11use crate::error::Error;
12use crate::net::sanitize_user_agent;
13
14/// Rendered page returned by [`crate::fetch()`].
15#[derive(Debug, Clone, Default, serde::Serialize)]
16#[non_exhaustive]
17pub struct Page {
18    /// Fully rendered HTML after JavaScript execution.
19    pub html: String,
20    /// Plain text content (`document.body.innerText`).
21    pub inner_text: String,
22    /// Page title extracted from `<title>` tag.
23    pub title: Option<String>,
24    /// Parsed layout data from the injected CSS heuristics script.
25    #[serde(skip_serializing_if = "Option::is_none")]
26    pub layout_json: Option<String>,
27    /// Per-node visibility flags from the visibility-aware extraction pass.
28    #[serde(skip)]
29    visibility_json: Option<String>,
30    /// Result of JavaScript evaluation, if [`FetchOptions::javascript`] was used.
31    #[serde(skip_serializing_if = "Option::is_none")]
32    pub js_result: Option<String>,
33    /// Browser console messages captured during page load.
34    pub console_messages: Vec<ConsoleMessage>,
35    /// Accessibility tree (AccessKit), serialized as JSON, if requested.
36    #[serde(skip_serializing_if = "Option::is_none")]
37    pub accessibility_tree: Option<String>,
38    /// Structured data extracted via [`FetchOptions::schema`].
39    #[serde(skip_serializing_if = "Option::is_none")]
40    pub extracted: Option<Value>,
41    /// PNG-encoded screenshot bytes — read via [`Page::screenshot_png`].
42    #[serde(skip)]
43    screenshot_png: Option<Vec<u8>>,
44    /// Typed AccessKit tree, shared cheaply across [`Page`] clones.
45    #[serde(skip)]
46    a11y: Option<Arc<HashMap<NodeId, Node>>>,
47    /// Visibility policy that was active when this page was fetched.
48    #[serde(skip)]
49    visibility_policy: crate::visibility::VisibilityPolicy,
50}
51
52impl Page {
53    /// Extract readable Markdown from this page.
54    pub fn markdown(&self) -> crate::error::Result<String> {
55        self.markdown_with_url("")
56    }
57
58    /// Extract readable Markdown, using the original URL for link resolution.
59    pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
60        Ok(crate::extract::extract_text(&self.extract_input(url, None))?)
61    }
62
63    /// Extract structured JSON from this page.
64    pub fn extract_json(&self) -> crate::error::Result<String> {
65        self.extract_json_with_url("")
66    }
67
68    /// Extract structured JSON, using the original URL for link resolution.
69    pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
70        Ok(crate::extract::extract_json(&self.extract_input(url, None))?)
71    }
72
73    /// Extract readable Markdown from the subtree matched by a CSS selector.
74    pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
75        Ok(crate::extract::extract_text(&self.extract_input(url, Some(selector)))?)
76    }
77
78    /// Extract structured JSON from the subtree matched by a CSS selector.
79    pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
80        Ok(crate::extract::extract_json(&self.extract_input(url, Some(selector)))?)
81    }
82
83    /// PNG screenshot bytes, if captured via [`FetchOptions::screenshot`].
84    #[must_use]
85    pub fn screenshot_png(&self) -> Option<&[u8]> {
86        self.screenshot_png.as_deref()
87    }
88
89    fn extract_input<'a>(&'a self, url: &'a str, selector: Option<&'a str>) -> crate::extract::ExtractInput<'a> {
90        crate::extract::ExtractInput::new(&self.html, url)
91            .with_layout_json(self.layout_json.as_deref())
92            .with_visibility_json(self.visibility_json.as_deref())
93            .with_a11y(self.a11y.as_deref())
94            .with_inner_text(Some(&self.inner_text))
95            .with_selector(selector)
96            .with_visibility(self.visibility_policy)
97    }
98
99    pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
100        let title = {
101            let doc = dom_query::Document::from(page.html.as_str());
102            let t = doc.select("title").text().to_string();
103            if t.is_empty() { None } else { Some(t) }
104        };
105        let screenshot_png = page.screenshot.and_then(|img| {
106            let mut buf = std::io::Cursor::new(Vec::new());
107            img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
108            Some(buf.into_inner())
109        });
110        Self {
111            html: page.html,
112            inner_text: page.inner_text.unwrap_or_default(),
113            title,
114            layout_json: page.layout_json,
115            visibility_json: page.visibility_json,
116            js_result: page.js_result,
117            console_messages: page
118                .console_messages
119                .into_iter()
120                .map(|m| ConsoleMessage {
121                    level: match m.level {
122                        crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
123                        crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
124                        crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
125                        crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
126                        crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
127                        crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
128                    },
129                    message: m.message,
130                })
131                .collect(),
132            screenshot_png,
133            accessibility_tree: page.accessibility_tree,
134            a11y: page.a11y.map(Arc::new),
135            extracted: None,
136            visibility_policy: crate::visibility::VisibilityPolicy::default(),
137        }
138    }
139}
140
141/// Browser console message captured during page load.
142#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
143#[non_exhaustive]
144pub struct ConsoleMessage {
145    /// Severity level.
146    pub level: ConsoleLevel,
147    /// Message text.
148    pub message: String,
149}
150
151/// Console message severity.
152#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
153#[serde(rename_all = "lowercase")]
154#[non_exhaustive]
155pub enum ConsoleLevel {
156    /// General log message.
157    Log,
158    /// Debug-level message.
159    Debug,
160    /// Informational message.
161    Info,
162    /// Warning message.
163    Warn,
164    /// Error message.
165    Error,
166    /// Trace-level message.
167    Trace,
168}
169
170impl ConsoleLevel {
171    /// Returns the string representation of this level.
172    #[must_use]
173    pub fn as_str(&self) -> &'static str {
174        match self {
175            Self::Log => "log",
176            Self::Debug => "debug",
177            Self::Info => "info",
178            Self::Warn => "warn",
179            Self::Error => "error",
180            Self::Trace => "trace",
181        }
182    }
183}
184
185impl fmt::Display for ConsoleLevel {
186    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
187        f.pad(self.as_str())
188    }
189}
190
/// What a fetch should produce.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Rendered content; this is the default.
    #[default]
    Content,
    /// A screenshot of the page.
    Screenshot { full_page: bool },
    /// Evaluation of the contained JavaScript expression.
    JavaScript(String),
}
200
201/// Options for a single page fetch.
202#[must_use = "options do nothing until passed to fetch()"]
203#[derive(Debug, Clone)]
204pub struct FetchOptions {
205    pub(crate) url: String,
206    pub(crate) timeout: Option<Duration>,
207    pub(crate) settle: Option<Duration>,
208    pub(crate) mode: FetchMode,
209    pub(crate) user_agent: Option<String>,
210    pub(crate) extract_schema: Option<crate::schema::ExtractSchema>,
211    pub(crate) visibility: Option<crate::visibility::VisibilityPolicy>,
212}
213
214impl FetchOptions {
215    /// Default page-load timeout used when neither `FetchOptions::timeout` nor
216    /// a [`crate::Client`] override is set.
217    pub(crate) const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30);
218
219    /// Default settle wait used when neither `FetchOptions::settle` nor a
220    /// [`crate::Client`] override is set.
221    pub(crate) const DEFAULT_SETTLE: Duration = Duration::ZERO;
222
223    /// Fetch rendered content (default mode).
224    pub fn new(url: &str) -> Self {
225        Self {
226            url: url.into(),
227            timeout: None,
228            settle: None,
229            mode: FetchMode::Content,
230            user_agent: None,
231            extract_schema: None,
232            visibility: None,
233        }
234    }
235
236    /// Capture a PNG screenshot.
237    pub fn screenshot(url: &str, full_page: bool) -> Self {
238        Self {
239            mode: FetchMode::Screenshot { full_page },
240            ..Self::new(url)
241        }
242    }
243
244    /// Execute a JavaScript expression and return the result.
245    pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
246        Self {
247            mode: FetchMode::JavaScript(expression.into()),
248            ..Self::new(url)
249        }
250    }
251
252    /// Page load timeout (default: 30s).
253    pub fn timeout(mut self, timeout: Duration) -> Self {
254        self.timeout = Some(timeout);
255        self
256    }
257
258    /// Extra wait after load event for SPA hydration (default: 0).
259    pub fn settle(mut self, settle: Duration) -> Self {
260        self.settle = Some(settle);
261        self
262    }
263
264    /// Override the User-Agent string for this request.
265    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
266        self.user_agent = Some(sanitize_user_agent(ua.into()));
267        self
268    }
269
270    /// Extract structured data from the rendered page using the given schema.
271    pub fn schema(mut self, schema: crate::schema::ExtractSchema) -> Self {
272        self.extract_schema = Some(schema);
273        self
274    }
275
276    /// Visibility-filtering policy applied during extraction.
277    pub fn visibility(mut self, policy: crate::visibility::VisibilityPolicy) -> Self {
278        self.visibility = Some(policy);
279        self
280    }
281
282    /// Resolve the effective timeout, falling back to [`Self::DEFAULT_TIMEOUT`].
283    pub(crate) fn effective_timeout(&self) -> Duration {
284        self.timeout.unwrap_or(Self::DEFAULT_TIMEOUT)
285    }
286
287    /// Resolve the effective settle wait, falling back to [`Self::DEFAULT_SETTLE`].
288    pub(crate) fn effective_settle(&self) -> Duration {
289        self.settle.unwrap_or(Self::DEFAULT_SETTLE)
290    }
291
292    /// Resolve the effective visibility policy, falling back to its default.
293    pub(crate) fn effective_visibility(&self) -> crate::visibility::VisibilityPolicy {
294        self.visibility.unwrap_or_default()
295    }
296}
297
298/// Fetch a single page via the embedded Servo engine (blocking).
299pub fn fetch_blocking(opts: &FetchOptions) -> crate::error::Result<Page> {
300    if let Some(pdf_page) = pre_fetch(opts)? {
301        return Ok(pdf_page);
302    }
303    let bridge_opts = build_bridge_options(opts);
304    let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| map_engine_error(e, opts))?;
305    Ok(finalize_page(servo_page, opts))
306}
307
308/// Fetch a single page via the embedded Servo engine.
309pub async fn fetch(opts: &FetchOptions) -> crate::error::Result<Page> {
310    if let Some(pdf_page) = pre_fetch_async(opts).await? {
311        return Ok(pdf_page);
312    }
313    let bridge_opts = build_bridge_options(opts);
314    let servo_page = crate::bridge::fetch_page_async(bridge_opts)
315        .await
316        .map_err(|e| map_engine_error(e, opts))?;
317    Ok(finalize_page(servo_page, opts))
318}
319
320/// Fetch a URL and return readable Markdown (blocking).
321pub fn markdown_blocking(url: &str) -> crate::error::Result<String> {
322    fetch_blocking(&FetchOptions::new(url))?.markdown_with_url(url)
323}
324
325/// Fetch a URL and return readable Markdown.
326pub async fn markdown(url: &str) -> crate::error::Result<String> {
327    fetch(&FetchOptions::new(url)).await?.markdown_with_url(url)
328}
329
330/// Fetch a URL and return structured JSON (blocking).
331pub fn extract_json_blocking(url: &str) -> crate::error::Result<String> {
332    fetch_blocking(&FetchOptions::new(url))?.extract_json_with_url(url)
333}
334
335/// Fetch a URL and return structured JSON.
336pub async fn extract_json(url: &str) -> crate::error::Result<String> {
337    fetch(&FetchOptions::new(url)).await?.extract_json_with_url(url)
338}
339
340/// Fetch a URL and return plain text (`document.body.innerText`) (blocking).
341pub fn text_blocking(url: &str) -> crate::error::Result<String> {
342    Ok(fetch_blocking(&FetchOptions::new(url))?.inner_text)
343}
344
345/// Fetch a URL and return plain text (`document.body.innerText`).
346pub async fn text(url: &str) -> crate::error::Result<String> {
347    Ok(fetch(&FetchOptions::new(url)).await?.inner_text)
348}
349
350fn pre_fetch(opts: &FetchOptions) -> crate::error::Result<Option<Page>> {
351    crate::net::ensure_crypto_provider();
352    crate::net::validate_url(&opts.url)?;
353
354    if matches!(opts.mode, FetchMode::Content)
355        && let Some(bytes) = crate::pdf::probe(&opts.url, opts.effective_timeout().as_secs().max(1))
356    {
357        return Ok(Some(pdf_page(&bytes)));
358    }
359
360    Ok(None)
361}
362
363async fn pre_fetch_async(opts: &FetchOptions) -> crate::error::Result<Option<Page>> {
364    crate::net::ensure_crypto_provider();
365    crate::net::validate_url(&opts.url)?;
366
367    if matches!(opts.mode, FetchMode::Content) {
368        let url = opts.url.clone();
369        let timeout_secs = opts.effective_timeout().as_secs().max(1);
370        let probe = tokio::task::spawn_blocking(move || crate::pdf::probe(&url, timeout_secs))
371            .await
372            .map_err(|e| Error::engine(anyhow::anyhow!("pdf probe task panicked: {e}"), Some(opts.url.clone())))?;
373        if let Some(bytes) = probe {
374            return Ok(Some(pdf_page(&bytes)));
375        }
376    }
377
378    Ok(None)
379}
380
381fn pdf_page(bytes: &[u8]) -> Page {
382    let text = crate::extract::extract_pdf(bytes);
383    Page {
384        html: String::new(),
385        inner_text: text,
386        ..Page::default()
387    }
388}
389
390fn build_bridge_options(opts: &FetchOptions) -> crate::bridge::FetchOptions<'_> {
391    crate::bridge::FetchOptions {
392        url: &opts.url,
393        timeout_secs: opts.effective_timeout().as_secs().max(1),
394        settle_ms: u64::try_from(opts.effective_settle().as_millis()).unwrap_or(u64::MAX),
395        user_agent: opts.user_agent.as_deref(),
396        mode: match opts.mode {
397            FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
398            FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
399            FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
400                expression: expr.clone(),
401            },
402        },
403    }
404}
405
406fn finalize_page(servo_page: crate::bridge::ServoPage, opts: &FetchOptions) -> Page {
407    let mut page = Page::from_servo(servo_page);
408    page.visibility_policy = opts.effective_visibility();
409    if let Some(schema) = opts.extract_schema.as_ref() {
410        page.extracted = Some(schema.extract_from(&page.html));
411    }
412    page
413}
414
415fn map_engine_error(e: anyhow::Error, opts: &FetchOptions) -> Error {
416    if format!("{e:#}").contains("timed out") {
417        Error::Timeout {
418            url: opts.url.clone(),
419            timeout: opts.effective_timeout(),
420        }
421    } else {
422        Error::engine(e, Some(opts.url.clone()))
423    }
424}
425
#[cfg(test)]
mod tests {
    use super::*;

    /// URL shared by the option-builder tests.
    const EXAMPLE_URL: &str = "https://example.com";

    /// Build a page that carries nothing but the given HTML.
    fn page_from_html(html: &str) -> Page {
        Page {
            html: html.to_owned(),
            ..Page::default()
        }
    }

    // --- FetchOptions -----------------------------------------------------

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new(EXAMPLE_URL);
        assert_eq!(opts.url, "https://example.com");
        assert!(opts.timeout.is_none());
        assert!(opts.settle.is_none());
        assert!(opts.visibility.is_none());
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_effective_defaults() {
        let opts = FetchOptions::new(EXAMPLE_URL);
        assert_eq!(opts.effective_timeout(), Duration::from_secs(30));
        assert_eq!(opts.effective_settle(), Duration::ZERO);
    }

    #[test]
    fn fetch_options_caller_value_preserved() {
        let timeout = Duration::from_secs(45);
        let settle = Duration::from_millis(250);
        let opts = FetchOptions::new(EXAMPLE_URL).timeout(timeout).settle(settle);
        assert_eq!(opts.timeout, Some(timeout));
        assert_eq!(opts.settle, Some(settle));
        assert_eq!(opts.effective_timeout(), timeout);
        assert_eq!(opts.effective_settle(), settle);
    }

    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot(EXAMPLE_URL, true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript(EXAMPLE_URL, "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new(EXAMPLE_URL)
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Some(Duration::from_secs(60)));
        assert_eq!(opts.settle, Some(Duration::from_millis(500)));
    }

    // --- User-Agent handling ----------------------------------------------

    #[test]
    fn fetch_user_agent_set() {
        let opts = FetchOptions::new(EXAMPLE_URL).user_agent("MyBot/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    #[test]
    fn fetch_user_agent_default_is_none() {
        let opts = FetchOptions::new(EXAMPLE_URL);
        assert!(opts.user_agent.is_none());
    }

    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let opts = FetchOptions::new(EXAMPLE_URL).user_agent("Bot\r\nX-Evil: yes");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot  X-Evil: yes"));
    }

    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let opts = FetchOptions::new(EXAMPLE_URL).user_agent("Bot\0/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot /1.0"));
    }

    #[test]
    fn fetch_user_agent_empty_string() {
        let opts = FetchOptions::new(EXAMPLE_URL).user_agent("");
        assert_eq!(opts.user_agent.as_deref(), Some(""));
    }

    // --- Page extraction ---------------------------------------------------

    #[test]
    fn page_markdown_from_html() {
        let mut page =
            page_from_html("<html><head><title>Test</title></head><body><p>hello world</p></body></html>");
        page.inner_text = "hello world".into();
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let mut page = page_from_html("<html><head><title>Test</title></head><body><p>content</p></body></html>");
        page.inner_text = "content".into();
        let json = page.extract_json().unwrap();
        let _: Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        assert!(Page::default().screenshot_png().is_none());
    }

    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = page_from_html("<html><body><article>keep</article><aside>drop</aside></body></html>");
        let md = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(md.contains("keep"));
        assert!(!md.contains("drop"));
    }

    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = page_from_html("<html><body><article>scoped</article></body></html>");
        let json = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let parsed: Value = serde_json::from_str(&json).expect("valid JSON");
        assert_eq!(parsed["url"].as_str(), Some("https://example.com/page"));
        assert!(parsed["text_content"].as_str().unwrap().contains("scoped"));
    }

    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = page_from_html("<html><body><article>x</article></body></html>");
        let md = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(md.is_empty());
    }

    #[test]
    fn page_markdown_with_invalid_selector_returns_error() {
        let page = page_from_html("<html><body><p>x</p></body></html>");
        let err = page.markdown_with_selector("", "###invalid[[[").unwrap_err();
        assert!(err.to_string().contains("invalid CSS selector"));
    }

    #[test]
    fn page_markdown_with_empty_selector_returns_error() {
        let page = page_from_html("<html><body><p>x</p></body></html>");
        assert!(page.markdown_with_selector("", "").is_err());
    }

    // --- URL validation ------------------------------------------------------

    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch_blocking(&FetchOptions::new("not a url"));
        assert!(matches!(result, Err(Error::InvalidUrl { .. })));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let opts = FetchOptions::new("http://127.0.0.1/");
        assert!(fetch_blocking(&opts).is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let opts = FetchOptions::new("file:///etc/passwd");
        assert!(fetch_blocking(&opts).is_err());
    }

    mod page_from_servo {
        use crate::bridge;
        use crate::fetch::{ConsoleLevel, Page};

        /// Solid red RGBA image of the given size.
        fn synthetic_image(w: u32, h: u32) -> image::RgbaImage {
            let red = image::Rgba([255, 0, 0, 255]);
            image::RgbaImage::from_pixel(w, h, red)
        }

        /// Bridge page with every field at its default.
        fn empty_servo_page() -> bridge::ServoPage {
            bridge::ServoPage::default()
        }

        /// Convert a bridge page that carries only the given HTML.
        fn convert_html(html: &str) -> Page {
            let mut sp = empty_servo_page();
            sp.html = html.into();
            Page::from_servo(sp)
        }

        /// Shorthand for a bridge console message.
        fn bridge_message(level: bridge::ConsoleLevel, message: &str) -> bridge::ConsoleMessage {
            bridge::ConsoleMessage {
                level,
                message: message.into(),
            }
        }

        #[test]
        fn extracts_title_from_html() {
            let page = convert_html("<html><head><title>Hello World</title></head></html>");
            assert_eq!(page.title.as_deref(), Some("Hello World"));
        }

        #[test]
        fn title_is_none_when_tag_missing() {
            let page = convert_html("<html><body>no title here</body></html>");
            assert!(page.title.is_none());
        }

        #[test]
        fn title_is_none_when_tag_empty() {
            let page = convert_html("<html><head><title></title></head></html>");
            assert!(page.title.is_none());
        }

        #[test]
        fn title_is_none_for_empty_html() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.title.is_none());
        }

        #[test]
        fn inner_text_none_becomes_empty_string() {
            let sp = empty_servo_page();
            assert!(sp.inner_text.is_none());
            let page = Page::from_servo(sp);
            assert_eq!(page.inner_text, "");
        }

        #[test]
        fn screenshot_is_encoded_as_png() {
            let mut sp = empty_servo_page();
            sp.screenshot = Some(synthetic_image(8, 8));
            let page = Page::from_servo(sp);
            let bytes = page.screenshot_png().expect("screenshot encoded");
            assert_eq!(&bytes[..8], b"\x89PNG\r\n\x1a\n", "PNG magic bytes");
        }

        #[test]
        fn console_messages_empty_by_default() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.console_messages.is_empty());
        }

        #[test]
        fn console_messages_preserve_all_six_levels() {
            let cases = [
                (bridge::ConsoleLevel::Log, ConsoleLevel::Log),
                (bridge::ConsoleLevel::Debug, ConsoleLevel::Debug),
                (bridge::ConsoleLevel::Info, ConsoleLevel::Info),
                (bridge::ConsoleLevel::Warn, ConsoleLevel::Warn),
                (bridge::ConsoleLevel::Error, ConsoleLevel::Error),
                (bridge::ConsoleLevel::Trace, ConsoleLevel::Trace),
            ];
            for (src, expected) in cases {
                let mut sp = empty_servo_page();
                sp.console_messages = vec![bridge_message(src, "msg")];
                let page = Page::from_servo(sp);
                assert_eq!(
                    page.console_messages.len(),
                    1,
                    "console message lost for source level {src:?}",
                );
                assert_eq!(
                    page.console_messages[0].level, expected,
                    "level mapping wrong for source {src:?}",
                );
            }
        }

        #[test]
        fn console_messages_preserve_ordering_across_levels() {
            let mut sp = empty_servo_page();
            sp.console_messages = vec![
                bridge_message(bridge::ConsoleLevel::Info, "first"),
                bridge_message(bridge::ConsoleLevel::Error, "second"),
                bridge_message(bridge::ConsoleLevel::Warn, "third"),
            ];
            let page = Page::from_servo(sp);
            let observed: Vec<(ConsoleLevel, &str)> = page
                .console_messages
                .iter()
                .map(|m| (m.level, m.message.as_str()))
                .collect();
            assert_eq!(
                observed,
                [
                    (ConsoleLevel::Info, "first"),
                    (ConsoleLevel::Error, "second"),
                    (ConsoleLevel::Warn, "third"),
                ]
            );
        }

        #[test]
        fn extracted_starts_as_none_until_schema_applied() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.extracted.is_none());
        }

        #[test]
        fn full_round_trip_preserves_every_field() {
            let html = "<html><head><title>T</title></head><body>B</body></html>";
            // Every field is spelled out so that adding one to `ServoPage` forces
            // this test to be revisited.
            let sp = bridge::ServoPage {
                html: html.into(),
                inner_text: Some("B".into()),
                layout_json: Some("[]".into()),
                visibility_json: Some("[]".into()),
                screenshot: Some(synthetic_image(2, 2)),
                js_result: Some("42".into()),
                accessibility_tree: Some("{}".into()),
                a11y: None,
                console_messages: vec![bridge_message(bridge::ConsoleLevel::Log, "x")],
            };
            let page = Page::from_servo(sp);
            assert_eq!(page.html, "<html><head><title>T</title></head><body>B</body></html>");
            assert_eq!(page.inner_text, "B");
            assert_eq!(page.title.as_deref(), Some("T"));
            assert_eq!(page.layout_json.as_deref(), Some("[]"));
            assert_eq!(page.js_result.as_deref(), Some("42"));
            assert_eq!(page.accessibility_tree.as_deref(), Some("{}"));
            assert_eq!(page.console_messages.len(), 1);
            assert!(page.screenshot_png().is_some());
            assert!(page.extracted.is_none());
        }
    }
}