// servo_fetch/fetch.rs

//! Single-page fetching and rendered content extraction.
2
3use std::time::Duration;
4
5use crate::error::Error;
6use crate::net::sanitize_user_agent;
7
8/// Rendered page returned by [`fetch`].
9#[derive(Debug, Clone, Default, serde::Serialize)]
10#[non_exhaustive]
11pub struct Page {
12    /// Fully rendered HTML after JavaScript execution.
13    pub html: String,
14    /// Plain text content (`document.body.innerText`).
15    pub inner_text: String,
16    /// Page title extracted from `<title>` tag.
17    pub title: Option<String>,
18    /// Parsed layout data from the injected CSS heuristics script.
19    #[serde(skip_serializing_if = "Option::is_none")]
20    pub layout_json: Option<String>,
21    /// Result of JavaScript evaluation, if [`FetchOptions::javascript`] was used.
22    #[serde(skip_serializing_if = "Option::is_none")]
23    pub js_result: Option<String>,
24    /// Browser console messages captured during page load.
25    pub console_messages: Vec<ConsoleMessage>,
26    /// Accessibility tree (AccessKit), if requested.
27    #[serde(skip_serializing_if = "Option::is_none")]
28    pub accessibility_tree: Option<String>,
29    /// Structured data extracted via [`FetchOptions::schema`].
30    #[serde(skip_serializing_if = "Option::is_none")]
31    pub extracted: Option<serde_json::Value>,
32    #[serde(skip)]
33    screenshot_png: Option<Vec<u8>>,
34}
35
36impl Page {
37    /// Extract readable Markdown from this page.
38    pub fn markdown(&self) -> crate::error::Result<String> {
39        self.markdown_with_url("")
40    }
41
42    /// Extract readable Markdown, using the original URL for link resolution.
43    pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
44        let input = crate::extract::ExtractInput::new(&self.html, url)
45            .with_layout_json(self.layout_json.as_deref())
46            .with_inner_text(Some(&self.inner_text));
47        Ok(crate::extract::extract_text(&input)?)
48    }
49
50    /// Extract structured JSON from this page.
51    pub fn extract_json(&self) -> crate::error::Result<String> {
52        self.extract_json_with_url("")
53    }
54
55    /// Extract structured JSON, using the original URL for link resolution.
56    pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
57        let input = crate::extract::ExtractInput::new(&self.html, url)
58            .with_layout_json(self.layout_json.as_deref())
59            .with_inner_text(Some(&self.inner_text));
60        Ok(crate::extract::extract_json(&input)?)
61    }
62
63    /// Extract readable Markdown from the subtree matched by a CSS selector.
64    pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
65        let input = crate::extract::ExtractInput::new(&self.html, url)
66            .with_layout_json(self.layout_json.as_deref())
67            .with_inner_text(Some(&self.inner_text))
68            .with_selector(Some(selector));
69        Ok(crate::extract::extract_text(&input)?)
70    }
71
72    /// Extract structured JSON from the subtree matched by a CSS selector.
73    pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
74        let input = crate::extract::ExtractInput::new(&self.html, url)
75            .with_layout_json(self.layout_json.as_deref())
76            .with_inner_text(Some(&self.inner_text))
77            .with_selector(Some(selector));
78        Ok(crate::extract::extract_json(&input)?)
79    }
80
81    /// PNG screenshot bytes, if captured via [`FetchOptions::screenshot`].
82    #[must_use]
83    pub fn screenshot_png(&self) -> Option<&[u8]> {
84        self.screenshot_png.as_deref()
85    }
86
87    pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
88        let title = {
89            let doc = dom_query::Document::from(page.html.as_str());
90            let t = doc.select("title").text().to_string();
91            if t.is_empty() { None } else { Some(t) }
92        };
93        let screenshot_png = page.screenshot.and_then(|img| {
94            let mut buf = std::io::Cursor::new(Vec::new());
95            img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
96            Some(buf.into_inner())
97        });
98        Self {
99            html: page.html,
100            inner_text: page.inner_text.unwrap_or_default(),
101            title,
102            layout_json: page.layout_json,
103            js_result: page.js_result,
104            console_messages: page
105                .console_messages
106                .into_iter()
107                .map(|m| ConsoleMessage {
108                    level: match m.level {
109                        crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
110                        crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
111                        crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
112                        crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
113                        crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
114                        crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
115                    },
116                    message: m.message,
117                })
118                .collect(),
119            screenshot_png,
120            accessibility_tree: page.accessibility_tree,
121            extracted: None,
122        }
123    }
124}
125
126/// Browser console message captured during page load.
127#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
128#[non_exhaustive]
129pub struct ConsoleMessage {
130    /// Severity level.
131    pub level: ConsoleLevel,
132    /// Message text.
133    pub message: String,
134}
135
136/// Console message severity.
137#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
138#[serde(rename_all = "lowercase")]
139#[non_exhaustive]
140pub enum ConsoleLevel {
141    /// General log message.
142    Log,
143    /// Debug-level message.
144    Debug,
145    /// Informational message.
146    Info,
147    /// Warning message.
148    Warn,
149    /// Error message.
150    Error,
151    /// Trace-level message.
152    Trace,
153}
154
155impl ConsoleLevel {
156    /// Returns the string representation of this level.
157    #[must_use]
158    pub fn as_str(&self) -> &'static str {
159        match self {
160            Self::Log => "log",
161            Self::Debug => "debug",
162            Self::Info => "info",
163            Self::Warn => "warn",
164            Self::Error => "error",
165            Self::Trace => "trace",
166        }
167    }
168}
169
170impl std::fmt::Display for ConsoleLevel {
171    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
172        f.pad(self.as_str())
173    }
174}
175
/// What a fetch should produce; chosen by the [`FetchOptions`] constructors.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Fetch the rendered content (the default).
    #[default]
    Content,
    /// Capture a PNG screenshot.
    Screenshot {
        /// Value of the `full_page` flag passed to [`FetchOptions::screenshot`].
        full_page: bool,
    },
    /// Execute the contained JavaScript expression and return its result.
    JavaScript(String),
}
185
186/// Options for a single page fetch.
187#[must_use = "options do nothing until passed to fetch()"]
188#[derive(Debug, Clone)]
189pub struct FetchOptions {
190    pub(crate) url: String,
191    pub(crate) timeout: Duration,
192    pub(crate) settle: Duration,
193    pub(crate) mode: FetchMode,
194    pub(crate) user_agent: Option<String>,
195    pub(crate) extract_schema: Option<crate::schema::ExtractSchema>,
196}
197
198impl FetchOptions {
199    /// Fetch rendered content (default mode).
200    pub fn new(url: &str) -> Self {
201        Self {
202            url: url.into(),
203            timeout: Duration::from_secs(30),
204            settle: Duration::ZERO,
205            mode: FetchMode::Content,
206            user_agent: None,
207            extract_schema: None,
208        }
209    }
210
211    /// Capture a PNG screenshot.
212    pub fn screenshot(url: &str, full_page: bool) -> Self {
213        Self {
214            mode: FetchMode::Screenshot { full_page },
215            ..Self::new(url)
216        }
217    }
218
219    /// Execute a JavaScript expression and return the result.
220    pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
221        Self {
222            mode: FetchMode::JavaScript(expression.into()),
223            ..Self::new(url)
224        }
225    }
226
227    /// Page load timeout (default: 30s).
228    pub fn timeout(mut self, timeout: Duration) -> Self {
229        self.timeout = timeout;
230        self
231    }
232
233    /// Extra wait after load event for SPA hydration (default: 0).
234    pub fn settle(mut self, settle: Duration) -> Self {
235        self.settle = settle;
236        self
237    }
238
239    /// Override the User-Agent string for this request.
240    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
241        self.user_agent = Some(sanitize_user_agent(ua.into()));
242        self
243    }
244
245    /// Extract structured data from the rendered page using the given schema.
246    pub fn schema(mut self, schema: crate::schema::ExtractSchema) -> Self {
247        self.extract_schema = Some(schema);
248        self
249    }
250}
251
252/// Fetch a single page via the embedded Servo engine.
253#[allow(clippy::needless_pass_by_value)]
254pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
255    crate::net::ensure_crypto_provider();
256
257    crate::net::validate_url(&opts.url)?;
258
259    if matches!(opts.mode, FetchMode::Content)
260        && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
261    {
262        let text = crate::extract::extract_pdf(&bytes);
263        return Ok(Page {
264            html: String::new(),
265            inner_text: text,
266            ..Page::default()
267        });
268    }
269
270    let bridge_opts = crate::bridge::FetchOptions {
271        url: &opts.url,
272        timeout_secs: opts.timeout.as_secs().max(1),
273        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
274        user_agent: opts.user_agent.as_deref(),
275        mode: match opts.mode {
276            FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
277            FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
278            FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
279                expression: expr.clone(),
280            },
281        },
282    };
283
284    let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
285        let msg = format!("{e:#}");
286        if msg.contains("timed out") {
287            Error::Timeout {
288                url: opts.url.clone(),
289                timeout: opts.timeout,
290            }
291        } else {
292            Error::Engine(msg)
293        }
294    })?;
295
296    let mut page = Page::from_servo(servo_page);
297    if let Some(schema) = opts.extract_schema.as_ref() {
298        page.extracted = Some(schema.extract_from(&page.html));
299    }
300    Ok(page)
301}
302
303/// Fetch a URL and return readable Markdown.
304pub fn markdown(url: &str) -> crate::error::Result<String> {
305    fetch(FetchOptions::new(url))?.markdown_with_url(url)
306}
307
308/// Fetch a URL and return structured JSON.
309pub fn extract_json(url: &str) -> crate::error::Result<String> {
310    fetch(FetchOptions::new(url))?.extract_json_with_url(url)
311}
312
313/// Fetch a URL and return plain text (`document.body.innerText`).
314pub fn text(url: &str) -> crate::error::Result<String> {
315    Ok(fetch(FetchOptions::new(url))?.inner_text)
316}
317
#[cfg(test)]
mod tests {
    use super::*;

    /// URL shared by the option-builder tests.
    const EXAMPLE_URL: &str = "https://example.com";

    /// Builds a page that carries nothing but the given HTML.
    fn html_page(html: &str) -> Page {
        Page {
            html: html.into(),
            ..Page::default()
        }
    }

    #[test]
    fn fetch_options_defaults() {
        let defaults = FetchOptions::new(EXAMPLE_URL);
        assert_eq!(defaults.url, "https://example.com");
        assert_eq!(defaults.timeout, Duration::from_secs(30));
        assert_eq!(defaults.settle, Duration::ZERO);
        assert!(matches!(defaults.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_screenshot() {
        let mode = FetchOptions::screenshot(EXAMPLE_URL, true).mode;
        assert!(matches!(mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let mode = FetchOptions::javascript(EXAMPLE_URL, "document.title").mode;
        assert!(matches!(mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let timeout = Duration::from_secs(60);
        let settle = Duration::from_millis(500);
        let chained = FetchOptions::new(EXAMPLE_URL).timeout(timeout).settle(settle);
        assert_eq!(chained.timeout, Duration::from_secs(60));
        assert_eq!(chained.settle, Duration::from_millis(500));
    }

    #[test]
    fn fetch_user_agent_set() {
        let with_ua = FetchOptions::new(EXAMPLE_URL).user_agent("MyBot/1.0");
        assert_eq!(with_ua.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    #[test]
    fn fetch_user_agent_default_is_none() {
        assert!(FetchOptions::new(EXAMPLE_URL).user_agent.is_none());
    }

    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let with_ua = FetchOptions::new(EXAMPLE_URL).user_agent("Bot\r\nX-Evil: yes");
        assert_eq!(with_ua.user_agent.as_deref(), Some("Bot  X-Evil: yes"));
    }

    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let with_ua = FetchOptions::new(EXAMPLE_URL).user_agent("Bot\0/1.0");
        assert_eq!(with_ua.user_agent.as_deref(), Some("Bot /1.0"));
    }

    #[test]
    fn fetch_user_agent_empty_string() {
        let with_ua = FetchOptions::new(EXAMPLE_URL).user_agent("");
        assert_eq!(with_ua.user_agent.as_deref(), Some(""));
    }

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            inner_text: "hello world".into(),
            ..html_page("<html><head><title>Test</title></head><body><p>hello world</p></body></html>")
        };
        let markdown = page.markdown().unwrap();
        assert!(markdown.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            inner_text: "content".into(),
            ..html_page("<html><head><title>Test</title></head><body><p>content</p></body></html>")
        };
        let raw = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&raw).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        assert!(Page::default().screenshot_png().is_none());
    }

    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = html_page("<html><body><article>keep</article><aside>drop</aside></body></html>");
        let markdown = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(markdown.contains("keep"));
        assert!(!markdown.contains("drop"));
    }

    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = html_page("<html><body><article>scoped</article></body></html>");
        let raw = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&raw).expect("valid JSON");
        assert_eq!(parsed["url"].as_str(), Some("https://example.com/page"));
        assert!(parsed["text_content"].as_str().unwrap().contains("scoped"));
    }

    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = html_page("<html><body><article>x</article></body></html>");
        let markdown = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(markdown.is_empty());
    }

    #[test]
    fn page_markdown_with_invalid_selector_returns_error() {
        let page = html_page("<html><body><p>x</p></body></html>");
        let err = page.markdown_with_selector("", "###invalid[[[").unwrap_err();
        assert!(err.to_string().contains("invalid CSS selector"));
    }

    #[test]
    fn page_markdown_with_empty_selector_returns_error() {
        let page = html_page("<html><body><p>x</p></body></html>");
        assert!(page.markdown_with_selector("", "").is_err());
    }

    #[test]
    fn fetch_rejects_invalid_url() {
        let outcome = fetch(FetchOptions::new("not a url"));
        assert!(outcome.is_err());
        assert!(matches!(outcome.unwrap_err(), Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        assert!(fetch(FetchOptions::new("http://127.0.0.1/")).is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        assert!(fetch(FetchOptions::new("file:///etc/passwd")).is_err());
    }

    mod page_from_servo {
        use crate::bridge;
        use crate::fetch::{ConsoleLevel, Page};

        /// Solid red image of the given size.
        fn synthetic_image(w: u32, h: u32) -> image::RgbaImage {
            let red = image::Rgba([255, 0, 0, 255]);
            image::RgbaImage::from_pixel(w, h, red)
        }

        /// Engine page with every field at its default.
        fn empty_servo_page() -> bridge::ServoPage {
            bridge::ServoPage::default()
        }

        /// Converts a default engine page that carries only `html`.
        fn convert_html(html: &str) -> Page {
            let mut raw = empty_servo_page();
            raw.html = html.into();
            Page::from_servo(raw)
        }

        #[test]
        fn extracts_title_from_html() {
            let page = convert_html("<html><head><title>Hello World</title></head></html>");
            assert_eq!(page.title.as_deref(), Some("Hello World"));
        }

        #[test]
        fn title_is_none_when_tag_missing() {
            let page = convert_html("<html><body>no title here</body></html>");
            assert!(page.title.is_none());
        }

        #[test]
        fn title_is_none_when_tag_empty() {
            let page = convert_html("<html><head><title></title></head></html>");
            assert!(page.title.is_none());
        }

        #[test]
        fn title_is_none_for_empty_html() {
            assert!(Page::from_servo(empty_servo_page()).title.is_none());
        }

        #[test]
        fn inner_text_none_becomes_empty_string() {
            let raw = empty_servo_page();
            assert!(raw.inner_text.is_none());
            assert_eq!(Page::from_servo(raw).inner_text, "");
        }

        #[test]
        fn screenshot_is_encoded_as_png() {
            let mut raw = empty_servo_page();
            raw.screenshot = Some(synthetic_image(8, 8));
            let page = Page::from_servo(raw);
            let png = page.screenshot_png().expect("screenshot encoded");
            assert_eq!(&png[..8], b"\x89PNG\r\n\x1a\n", "PNG magic bytes");
        }

        #[test]
        fn console_messages_empty_by_default() {
            assert!(Page::from_servo(empty_servo_page()).console_messages.is_empty());
        }

        #[test]
        fn console_messages_preserve_all_six_levels() {
            let cases = [
                (bridge::ConsoleLevel::Log, ConsoleLevel::Log),
                (bridge::ConsoleLevel::Debug, ConsoleLevel::Debug),
                (bridge::ConsoleLevel::Info, ConsoleLevel::Info),
                (bridge::ConsoleLevel::Warn, ConsoleLevel::Warn),
                (bridge::ConsoleLevel::Error, ConsoleLevel::Error),
                (bridge::ConsoleLevel::Trace, ConsoleLevel::Trace),
            ];
            for (src, expected) in cases {
                let mut raw = empty_servo_page();
                raw.console_messages = vec![bridge::ConsoleMessage {
                    level: src,
                    message: "msg".into(),
                }];
                let converted = Page::from_servo(raw).console_messages;
                assert_eq!(
                    converted.len(),
                    1,
                    "console message lost for source level {src:?}",
                );
                assert_eq!(
                    converted[0].level, expected,
                    "level mapping wrong for source {src:?}",
                );
            }
        }

        #[test]
        fn console_messages_preserve_ordering_across_levels() {
            let mut raw = empty_servo_page();
            raw.console_messages = vec![
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Info,
                    message: "first".into(),
                },
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Error,
                    message: "second".into(),
                },
                bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Warn,
                    message: "third".into(),
                },
            ];
            let page = Page::from_servo(raw);
            assert_eq!(page.console_messages.len(), 3);

            let expected = [
                ("first", ConsoleLevel::Info),
                ("second", ConsoleLevel::Error),
                ("third", ConsoleLevel::Warn),
            ];
            for (actual, (text, _)) in page.console_messages.iter().zip(expected) {
                assert_eq!(actual.message, text);
            }
            for (actual, (_, level)) in page.console_messages.iter().zip(expected) {
                assert_eq!(actual.level, level);
            }
        }

        #[test]
        fn extracted_starts_as_none_until_schema_applied() {
            assert!(Page::from_servo(empty_servo_page()).extracted.is_none());
        }

        #[test]
        fn full_round_trip_preserves_every_field() {
            let raw = bridge::ServoPage {
                html: "<html><head><title>T</title></head><body>B</body></html>".into(),
                inner_text: Some("B".into()),
                layout_json: Some("[]".into()),
                screenshot: Some(synthetic_image(2, 2)),
                js_result: Some("42".into()),
                accessibility_tree: Some("{}".into()),
                console_messages: vec![bridge::ConsoleMessage {
                    level: bridge::ConsoleLevel::Log,
                    message: "x".into(),
                }],
            };
            let page = Page::from_servo(raw);
            assert_eq!(page.html, "<html><head><title>T</title></head><body>B</body></html>");
            assert_eq!(page.inner_text, "B");
            assert_eq!(page.title.as_deref(), Some("T"));
            assert_eq!(page.layout_json.as_deref(), Some("[]"));
            assert_eq!(page.js_result.as_deref(), Some("42"));
            assert_eq!(page.accessibility_tree.as_deref(), Some("{}"));
            assert_eq!(page.console_messages.len(), 1);
            assert!(page.screenshot_png().is_some());
            assert!(page.extracted.is_none());
        }
    }
}