Skip to main content

servo_fetch/
fetch.rs

1//! Single-page fetching and rendered content extraction.
2
3use std::collections::HashMap;
4use std::sync::Arc;
5use std::time::Duration;
6
7use servo::accesskit::{Node, NodeId};
8
9use crate::error::Error;
10use crate::net::sanitize_user_agent;
11
/// Rendered page returned by [`fetch`].
///
/// Public fields expose what the fetch captured. Private fields are skipped
/// during serialization; they are either read through an accessor such as
/// [`Page::screenshot_png`] or handed to the extractor by the `markdown*` and
/// `extract_json*` helpers.
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// Fully rendered HTML after JavaScript execution.
    ///
    /// Left empty when [`fetch`] short-circuits on a PDF response.
    pub html: String,
    /// Plain text content (`document.body.innerText`).
    ///
    /// Empty when the engine reported no inner text. For PDF responses,
    /// [`fetch`] stores the text extracted from the PDF here instead.
    pub inner_text: String,
    /// Page title extracted from the `<title>` tag; `None` when the tag is
    /// missing or has no text.
    pub title: Option<String>,
    /// Layout data from the injected CSS heuristics script, kept as a string
    /// (presumably JSON, going by the field name). Omitted from serialized
    /// output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Per-node visibility flags from the visibility-aware extraction pass.
    /// Never serialized; only forwarded to the extractor.
    #[serde(skip)]
    visibility_json: Option<String>,
    /// Result of JavaScript evaluation, if [`FetchOptions::javascript`] was used.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Browser console messages captured during page load, in the order the
    /// engine bridge reported them.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree (AccessKit), serialized as JSON, if requested.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    /// Structured data extracted via [`FetchOptions::schema`].
    /// `None` unless a schema was supplied; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub extracted: Option<serde_json::Value>,
    /// PNG-encoded screenshot bytes — read via [`Page::screenshot_png`].
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
    /// Typed AccessKit tree, shared cheaply across [`Page`] clones
    /// (cloning only bumps the `Arc` reference count).
    #[serde(skip)]
    a11y: Option<Arc<HashMap<NodeId, Node>>>,
    /// Visibility policy forwarded to the extractor. [`fetch`] copies it from
    /// [`FetchOptions::visibility`] for engine-rendered pages.
    #[serde(skip)]
    visibility_policy: crate::visibility::VisibilityPolicy,
}
49
50impl Page {
51    /// Extract readable Markdown from this page.
52    pub fn markdown(&self) -> crate::error::Result<String> {
53        self.markdown_with_url("")
54    }
55
56    /// Extract readable Markdown, using the original URL for link resolution.
57    pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
58        Ok(crate::extract::extract_text(&self.extract_input(url, None))?)
59    }
60
61    /// Extract structured JSON from this page.
62    pub fn extract_json(&self) -> crate::error::Result<String> {
63        self.extract_json_with_url("")
64    }
65
66    /// Extract structured JSON, using the original URL for link resolution.
67    pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
68        Ok(crate::extract::extract_json(&self.extract_input(url, None))?)
69    }
70
71    /// Extract readable Markdown from the subtree matched by a CSS selector.
72    pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
73        Ok(crate::extract::extract_text(&self.extract_input(url, Some(selector)))?)
74    }
75
76    /// Extract structured JSON from the subtree matched by a CSS selector.
77    pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
78        Ok(crate::extract::extract_json(&self.extract_input(url, Some(selector)))?)
79    }
80
81    /// PNG screenshot bytes, if captured via [`FetchOptions::screenshot`].
82    #[must_use]
83    pub fn screenshot_png(&self) -> Option<&[u8]> {
84        self.screenshot_png.as_deref()
85    }
86
87    fn extract_input<'a>(&'a self, url: &'a str, selector: Option<&'a str>) -> crate::extract::ExtractInput<'a> {
88        crate::extract::ExtractInput::new(&self.html, url)
89            .with_layout_json(self.layout_json.as_deref())
90            .with_visibility_json(self.visibility_json.as_deref())
91            .with_a11y(self.a11y.as_deref())
92            .with_inner_text(Some(&self.inner_text))
93            .with_selector(selector)
94            .with_visibility(self.visibility_policy)
95    }
96
97    pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
98        let title = {
99            let doc = dom_query::Document::from(page.html.as_str());
100            let t = doc.select("title").text().to_string();
101            if t.is_empty() { None } else { Some(t) }
102        };
103        let screenshot_png = page.screenshot.and_then(|img| {
104            let mut buf = std::io::Cursor::new(Vec::new());
105            img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
106            Some(buf.into_inner())
107        });
108        Self {
109            html: page.html,
110            inner_text: page.inner_text.unwrap_or_default(),
111            title,
112            layout_json: page.layout_json,
113            visibility_json: page.visibility_json,
114            js_result: page.js_result,
115            console_messages: page
116                .console_messages
117                .into_iter()
118                .map(|m| ConsoleMessage {
119                    level: match m.level {
120                        crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
121                        crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
122                        crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
123                        crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
124                        crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
125                        crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
126                    },
127                    message: m.message,
128                })
129                .collect(),
130            screenshot_png,
131            accessibility_tree: page.accessibility_tree,
132            a11y: page.a11y.map(Arc::new),
133            extracted: None,
134            visibility_policy: crate::visibility::VisibilityPolicy::default(),
135        }
136    }
137}
138
/// Browser console message captured during page load.
///
/// Collected into [`Page::console_messages`].
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity level.
    pub level: ConsoleLevel,
    /// Message text.
    pub message: String,
}
148
/// Console message severity.
///
/// Serialized as the lowercase variant name (`"log"`, `"warn"`, ...), which
/// matches what [`ConsoleLevel::as_str`] returns.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    /// General log message.
    Log,
    /// Debug-level message.
    Debug,
    /// Informational message.
    Info,
    /// Warning message.
    Warn,
    /// Error message.
    Error,
    /// Trace-level message.
    Trace,
}
167
168impl ConsoleLevel {
169    /// Returns the string representation of this level.
170    #[must_use]
171    pub fn as_str(&self) -> &'static str {
172        match self {
173            Self::Log => "log",
174            Self::Debug => "debug",
175            Self::Info => "info",
176            Self::Warn => "warn",
177            Self::Error => "error",
178            Self::Trace => "trace",
179        }
180    }
181}
182
183impl std::fmt::Display for ConsoleLevel {
184    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
185        f.pad(self.as_str())
186    }
187}
188
/// What a fetch should produce. Chosen by the [`FetchOptions`] constructors
/// and translated into the engine bridge's own mode by [`fetch`].
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Fetch rendered content (the default; see [`FetchOptions::new`]).
    #[default]
    Content,
    /// Capture a screenshot (see [`FetchOptions::screenshot`]).
    Screenshot {
        /// Forwarded unchanged to the engine bridge; presumably selects the
        /// whole page rather than just the viewport — confirm against the bridge.
        full_page: bool,
    },
    /// Evaluate the contained JavaScript expression (see
    /// [`FetchOptions::javascript`]).
    JavaScript(String),
}
198
/// Options for a single page fetch.
///
/// Build one with [`FetchOptions::new`], [`FetchOptions::screenshot`] or
/// [`FetchOptions::javascript`], adjust it with the chained setters, and pass
/// it to [`fetch`].
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    /// URL to fetch; validated by [`fetch`] before anything is requested.
    pub(crate) url: String,
    /// Page load timeout. [`fetch`] forwards it in whole seconds, at least 1.
    pub(crate) timeout: Duration,
    /// Extra wait after the load event, forwarded in milliseconds.
    pub(crate) settle: Duration,
    /// What the fetch should produce.
    pub(crate) mode: FetchMode,
    /// Sanitized User-Agent override; `None` means no override is sent to the bridge.
    pub(crate) user_agent: Option<String>,
    /// Schema applied to the rendered HTML to fill [`Page::extracted`].
    pub(crate) extract_schema: Option<crate::schema::ExtractSchema>,
    /// Visibility-filtering policy recorded on the resulting [`Page`].
    pub(crate) visibility: crate::visibility::VisibilityPolicy,
}
211
212impl FetchOptions {
213    /// Fetch rendered content (default mode).
214    pub fn new(url: &str) -> Self {
215        Self {
216            url: url.into(),
217            timeout: Duration::from_secs(30),
218            settle: Duration::ZERO,
219            mode: FetchMode::Content,
220            user_agent: None,
221            extract_schema: None,
222            visibility: crate::visibility::VisibilityPolicy::default(),
223        }
224    }
225
226    /// Capture a PNG screenshot.
227    pub fn screenshot(url: &str, full_page: bool) -> Self {
228        Self {
229            mode: FetchMode::Screenshot { full_page },
230            ..Self::new(url)
231        }
232    }
233
234    /// Execute a JavaScript expression and return the result.
235    pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
236        Self {
237            mode: FetchMode::JavaScript(expression.into()),
238            ..Self::new(url)
239        }
240    }
241
242    /// Page load timeout (default: 30s).
243    pub fn timeout(mut self, timeout: Duration) -> Self {
244        self.timeout = timeout;
245        self
246    }
247
248    /// Extra wait after load event for SPA hydration (default: 0).
249    pub fn settle(mut self, settle: Duration) -> Self {
250        self.settle = settle;
251        self
252    }
253
254    /// Override the User-Agent string for this request.
255    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
256        self.user_agent = Some(sanitize_user_agent(ua.into()));
257        self
258    }
259
260    /// Extract structured data from the rendered page using the given schema.
261    pub fn schema(mut self, schema: crate::schema::ExtractSchema) -> Self {
262        self.extract_schema = Some(schema);
263        self
264    }
265
266    /// Visibility-filtering policy applied during extraction.
267    pub fn visibility(mut self, policy: crate::visibility::VisibilityPolicy) -> Self {
268        self.visibility = policy;
269        self
270    }
271}
272
273/// Fetch a single page via the embedded Servo engine.
274#[allow(clippy::needless_pass_by_value)]
275pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
276    crate::net::ensure_crypto_provider();
277
278    crate::net::validate_url(&opts.url)?;
279
280    if matches!(opts.mode, FetchMode::Content)
281        && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
282    {
283        let text = crate::extract::extract_pdf(&bytes);
284        return Ok(Page {
285            html: String::new(),
286            inner_text: text,
287            ..Page::default()
288        });
289    }
290
291    let bridge_opts = crate::bridge::FetchOptions {
292        url: &opts.url,
293        timeout_secs: opts.timeout.as_secs().max(1),
294        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
295        user_agent: opts.user_agent.as_deref(),
296        mode: match opts.mode {
297            FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
298            FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
299            FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
300                expression: expr.clone(),
301            },
302        },
303    };
304
305    let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
306        let msg = format!("{e:#}");
307        if msg.contains("timed out") {
308            Error::Timeout {
309                url: opts.url.clone(),
310                timeout: opts.timeout,
311            }
312        } else {
313            Error::Engine(msg)
314        }
315    })?;
316
317    let mut page = Page::from_servo(servo_page);
318    page.visibility_policy = opts.visibility;
319    if let Some(schema) = opts.extract_schema.as_ref() {
320        page.extracted = Some(schema.extract_from(&page.html));
321    }
322    Ok(page)
323}
324
325/// Fetch a URL and return readable Markdown.
326pub fn markdown(url: &str) -> crate::error::Result<String> {
327    fetch(FetchOptions::new(url))?.markdown_with_url(url)
328}
329
330/// Fetch a URL and return structured JSON.
331pub fn extract_json(url: &str) -> crate::error::Result<String> {
332    fetch(FetchOptions::new(url))?.extract_json_with_url(url)
333}
334
335/// Fetch a URL and return plain text (`document.body.innerText`).
336pub fn text(url: &str) -> crate::error::Result<String> {
337    Ok(fetch(FetchOptions::new(url))?.inner_text)
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    // --- FetchOptions construction and builder methods ---

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
        assert!(opts.user_agent.is_none());
        assert!(opts.extract_schema.is_none());
    }

    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    #[test]
    fn fetch_user_agent_set() {
        let opts = FetchOptions::new("https://example.com").user_agent("MyBot/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    #[test]
    fn fetch_user_agent_default_is_none() {
        let opts = FetchOptions::new("https://example.com");
        assert!(opts.user_agent.is_none());
    }

    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\r\nX-Evil: yes");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot  X-Evil: yes"));
    }

    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\0/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot /1.0"));
    }

    #[test]
    fn fetch_user_agent_empty_string() {
        let opts = FetchOptions::new("https://example.com").user_agent("");
        assert_eq!(opts.user_agent.as_deref(), Some(""));
    }

    // --- Page extraction helpers ---

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = Page {
            html: "<html><body><article>keep</article><aside>drop</aside></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(md.contains("keep"));
        assert!(!md.contains("drop"));
    }

    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = Page {
            html: "<html><body><article>scoped</article></body></html>".into(),
            ..Page::default()
        };
        let json = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
        assert_eq!(parsed["url"].as_str(), Some("https://example.com/page"));
        assert!(parsed["text_content"].as_str().unwrap().contains("scoped"));
    }

    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = Page {
            html: "<html><body><article>x</article></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(md.is_empty());
    }

    #[test]
    fn page_markdown_with_invalid_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        let err = page.markdown_with_selector("", "###invalid[[[").unwrap_err();
        assert!(err.to_string().contains("invalid CSS selector"));
    }

    #[test]
    fn page_markdown_with_empty_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        assert!(page.markdown_with_selector("", "").is_err());
    }

    // --- URL validation performed by fetch() before any network access ---

    #[test]
    fn fetch_rejects_invalid_url() {
        let err = fetch(FetchOptions::new("not a url")).expect_err("invalid URL must be rejected");
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }

    /// Conversion from the bridge's `ServoPage` into the public `Page`.
    mod page_from_servo {
        use crate::bridge;
        use crate::fetch::{ConsoleLevel, Page};

        /// Solid red image of the given size, used as a stand-in screenshot.
        fn synthetic_image(w: u32, h: u32) -> image::RgbaImage {
            image::RgbaImage::from_pixel(w, h, image::Rgba([255, 0, 0, 255]))
        }

        /// Bridge page with every field at its default.
        fn empty_servo_page() -> bridge::ServoPage {
            bridge::ServoPage::default()
        }

        #[test]
        fn extracts_title_from_html() {
            let mut sp = empty_servo_page();
            sp.html = "<html><head><title>Hello World</title></head></html>".into();
            let page = Page::from_servo(sp);
            assert_eq!(page.title.as_deref(), Some("Hello World"));
        }

        #[test]
        fn title_is_none_when_tag_missing() {
            let mut sp = empty_servo_page();
            sp.html = "<html><body>no title here</body></html>".into();
            let page = Page::from_servo(sp);
            assert!(page.title.is_none());
        }

        #[test]
        fn title_is_none_when_tag_empty() {
            let mut sp = empty_servo_page();
            sp.html = "<html><head><title></title></head></html>".into();
            let page = Page::from_servo(sp);
            assert!(page.title.is_none());
        }

        #[test]
        fn title_is_none_for_empty_html() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.title.is_none());
        }

        #[test]
        fn inner_text_none_becomes_empty_string() {
            let sp = empty_servo_page();
            assert!(sp.inner_text.is_none());
            let page = Page::from_servo(sp);
            assert_eq!(page.inner_text, "");
        }

        #[test]
        fn screenshot_is_encoded_as_png() {
            let mut sp = empty_servo_page();
            sp.screenshot = Some(synthetic_image(8, 8));
            let page = Page::from_servo(sp);
            let bytes = page.screenshot_png().expect("screenshot encoded");
            assert_eq!(&bytes[..8], b"\x89PNG\r\n\x1a\n", "PNG magic bytes");
        }

        #[test]
        fn console_messages_empty_by_default() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.console_messages.is_empty());
        }

        #[test]
        fn console_messages_preserve_all_six_levels() {
            let cases = [
                (bridge::ConsoleLevel::Log, ConsoleLevel::Log),
                (bridge::ConsoleLevel::Debug, ConsoleLevel::Debug),
                (bridge::ConsoleLevel::Info, ConsoleLevel::Info),
                (bridge::ConsoleLevel::Warn, ConsoleLevel::Warn),
                (bridge::ConsoleLevel::Error, ConsoleLevel::Error),
                (bridge::ConsoleLevel::Trace, ConsoleLevel::Trace),
            ];
            for (src, expected) in cases {
                let mut sp = empty_servo_page();
                sp.console_messages = vec![bridge::ConsoleMessage {
                    level: src,
                    message: "msg".into(),
                }];
                let page = Page::from_servo(sp);
                assert_eq!(
                    page.console_messages.len(),
                    1,
                    "console message lost for source level {src:?}",
                );
                assert_eq!(
                    page.console_messages[0].level, expected,
                    "level mapping wrong for source {src:?}",
                );
            }
        }

        #[test]
        fn console_messages_preserve_ordering_across_levels() {
            let mut sp = empty_servo_page();
            sp.console_messages = vec![
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Info,
                    message: "first".into(),
                },
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Error,
                    message: "second".into(),
                },
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Warn,
                    message: "third".into(),
                },
            ];
            let page = Page::from_servo(sp);
            assert_eq!(page.console_messages.len(), 3);
            assert_eq!(page.console_messages[0].message, "first");
            assert_eq!(page.console_messages[1].message, "second");
            assert_eq!(page.console_messages[2].message, "third");
            assert_eq!(page.console_messages[0].level, ConsoleLevel::Info);
            assert_eq!(page.console_messages[1].level, ConsoleLevel::Error);
            assert_eq!(page.console_messages[2].level, ConsoleLevel::Warn);
        }

        #[test]
        fn extracted_starts_as_none_until_schema_applied() {
            let page = Page::from_servo(empty_servo_page());
            assert!(page.extracted.is_none());
        }

        #[test]
        fn full_round_trip_preserves_every_field() {
            let sp = bridge::ServoPage {
                html: "<html><head><title>T</title></head><body>B</body></html>".into(),
                inner_text: Some("B".into()),
                layout_json: Some("[]".into()),
                visibility_json: Some("[]".into()),
                screenshot: Some(synthetic_image(2, 2)),
                js_result: Some("42".into()),
                accessibility_tree: Some("{}".into()),
                a11y: None,
                console_messages: vec![bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Log,
                    message: "x".into(),
                }],
            };
            let page = Page::from_servo(sp);
            assert_eq!(page.html, "<html><head><title>T</title></head><body>B</body></html>");
            assert_eq!(page.inner_text, "B");
            assert_eq!(page.title.as_deref(), Some("T"));
            assert_eq!(page.layout_json.as_deref(), Some("[]"));
            // Private fields are reachable from this descendant module, so the
            // test can live up to its name and cover them as well.
            assert_eq!(page.visibility_json.as_deref(), Some("[]"));
            assert!(page.a11y.is_none());
            assert_eq!(page.js_result.as_deref(), Some("42"));
            assert_eq!(page.accessibility_tree.as_deref(), Some("{}"));
            assert_eq!(page.console_messages.len(), 1);
            assert_eq!(page.console_messages[0].level, ConsoleLevel::Log);
            assert_eq!(page.console_messages[0].message, "x");
            assert!(page.screenshot_png().is_some());
            assert!(page.extracted.is_none());
        }
    }
}