Skip to main content

servo_fetch/
fetch.rs

1//! Single-page fetching and rendered content extraction.
2
3use std::time::Duration;
4
5use crate::error::Error;
6use crate::net::sanitize_user_agent;
7
/// Rendered page returned by [`fetch`].
///
/// The type is serializable. Optional fields (`layout_json`, `js_result`,
/// `accessibility_tree`) are left out of the serialized output when they are
/// `None`, and the screenshot bytes are never serialized — read them through
/// [`Page::screenshot_png`] instead.
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// Fully rendered HTML after JavaScript execution.
    pub html: String,
    /// Plain text content (`document.body.innerText`).
    pub inner_text: String,
    /// Page title extracted from `<title>` tag; `None` when no title text was found.
    pub title: Option<String>,
    /// Layout data from the injected CSS heuristics script, stored as a string.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Result of JavaScript evaluation, if [`FetchOptions::javascript`] was used.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Browser console messages captured during page load.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree (AccessKit), if requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    /// PNG-encoded screenshot bytes, if one was captured. Kept private and
    /// excluded from serialization; exposed via [`Page::screenshot_png`].
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}
32
33impl Page {
34    /// Extract readable Markdown from this page.
35    pub fn markdown(&self) -> crate::error::Result<String> {
36        self.markdown_with_url("")
37    }
38
39    /// Extract readable Markdown, using the original URL for link resolution.
40    pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
41        let input = crate::extract::ExtractInput::new(&self.html, url)
42            .with_layout_json(self.layout_json.as_deref())
43            .with_inner_text(Some(&self.inner_text));
44        Ok(crate::extract::extract_text(&input)?)
45    }
46
47    /// Extract structured JSON from this page.
48    pub fn extract_json(&self) -> crate::error::Result<String> {
49        self.extract_json_with_url("")
50    }
51
52    /// Extract structured JSON, using the original URL for link resolution.
53    pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
54        let input = crate::extract::ExtractInput::new(&self.html, url)
55            .with_layout_json(self.layout_json.as_deref())
56            .with_inner_text(Some(&self.inner_text));
57        Ok(crate::extract::extract_json(&input)?)
58    }
59
60    /// Extract readable Markdown from the subtree matched by a CSS selector.
61    pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
62        let input = crate::extract::ExtractInput::new(&self.html, url)
63            .with_layout_json(self.layout_json.as_deref())
64            .with_inner_text(Some(&self.inner_text))
65            .with_selector(Some(selector));
66        Ok(crate::extract::extract_text(&input)?)
67    }
68
69    /// Extract structured JSON from the subtree matched by a CSS selector.
70    pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
71        let input = crate::extract::ExtractInput::new(&self.html, url)
72            .with_layout_json(self.layout_json.as_deref())
73            .with_inner_text(Some(&self.inner_text))
74            .with_selector(Some(selector));
75        Ok(crate::extract::extract_json(&input)?)
76    }
77
78    /// PNG screenshot bytes, if captured via [`FetchOptions::screenshot`].
79    #[must_use]
80    pub fn screenshot_png(&self) -> Option<&[u8]> {
81        self.screenshot_png.as_deref()
82    }
83
84    pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
85        let title = {
86            let doc = dom_query::Document::from(page.html.as_str());
87            let t = doc.select("title").text().to_string();
88            if t.is_empty() { None } else { Some(t) }
89        };
90        let screenshot_png = page.screenshot.and_then(|img| {
91            let mut buf = std::io::Cursor::new(Vec::new());
92            img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
93            Some(buf.into_inner())
94        });
95        Self {
96            html: page.html,
97            inner_text: page.inner_text.unwrap_or_default(),
98            title,
99            layout_json: page.layout_json,
100            js_result: page.js_result,
101            console_messages: page
102                .console_messages
103                .into_iter()
104                .map(|m| ConsoleMessage {
105                    level: match m.level {
106                        crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
107                        crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
108                        crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
109                        crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
110                        crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
111                        crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
112                    },
113                    message: m.message,
114                })
115                .collect(),
116            screenshot_png,
117            accessibility_tree: page.accessibility_tree,
118        }
119    }
120}
121
/// Browser console message captured during page load.
///
/// One entry of [`Page::console_messages`].
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity level.
    pub level: ConsoleLevel,
    /// Message text.
    pub message: String,
}
131
/// Console message severity.
///
/// Serialized as the lowercase variant name (for example `"warn"`), which is
/// also what [`ConsoleLevel::as_str`] returns.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    /// General log message.
    Log,
    /// Debug-level message.
    Debug,
    /// Informational message.
    Info,
    /// Warning message.
    Warn,
    /// Error message.
    Error,
    /// Trace-level message.
    Trace,
}
150
151impl ConsoleLevel {
152    /// Returns the string representation of this level.
153    #[must_use]
154    pub fn as_str(&self) -> &'static str {
155        match self {
156            Self::Log => "log",
157            Self::Debug => "debug",
158            Self::Info => "info",
159            Self::Warn => "warn",
160            Self::Error => "error",
161            Self::Trace => "trace",
162        }
163    }
164}
165
166impl std::fmt::Display for ConsoleLevel {
167    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
168        f.pad(self.as_str())
169    }
170}
171
/// What a fetch should produce. Chosen by the [`FetchOptions`] constructors
/// and translated into the bridge's own mode type inside [`fetch`].
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Fetch rendered page content (the default).
    #[default]
    Content,
    /// Capture a screenshot of the page.
    Screenshot {
        /// Forwarded unchanged to the bridge. Presumably selects the whole
        /// page rather than only the viewport — confirm against `crate::bridge`.
        full_page: bool,
    },
    /// Evaluate the contained JavaScript expression in the page.
    JavaScript(String),
}
181
/// Options for a single page fetch.
///
/// Built with [`FetchOptions::new`], [`FetchOptions::screenshot`] or
/// [`FetchOptions::javascript`], optionally refined with the chained setters,
/// and then handed to [`fetch`].
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    /// Target URL; validated by [`fetch`] before anything is loaded.
    pub(crate) url: String,
    /// Page load timeout. [`fetch`] forwards it in whole seconds, with a
    /// minimum of one second.
    pub(crate) timeout: Duration,
    /// Extra wait after the load event; forwarded to the bridge in milliseconds.
    pub(crate) settle: Duration,
    /// What the fetch should produce (content, screenshot or JS result).
    pub(crate) mode: FetchMode,
    /// Sanitized User-Agent override; `None` when no override was requested.
    pub(crate) user_agent: Option<String>,
}
192
193impl FetchOptions {
194    /// Fetch rendered content (default mode).
195    pub fn new(url: &str) -> Self {
196        Self {
197            url: url.into(),
198            timeout: Duration::from_secs(30),
199            settle: Duration::ZERO,
200            mode: FetchMode::Content,
201            user_agent: None,
202        }
203    }
204
205    /// Capture a PNG screenshot.
206    pub fn screenshot(url: &str, full_page: bool) -> Self {
207        Self {
208            mode: FetchMode::Screenshot { full_page },
209            ..Self::new(url)
210        }
211    }
212
213    /// Execute a JavaScript expression and return the result.
214    pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
215        Self {
216            mode: FetchMode::JavaScript(expression.into()),
217            ..Self::new(url)
218        }
219    }
220
221    /// Page load timeout (default: 30s).
222    pub fn timeout(mut self, timeout: Duration) -> Self {
223        self.timeout = timeout;
224        self
225    }
226
227    /// Extra wait after load event for SPA hydration (default: 0).
228    pub fn settle(mut self, settle: Duration) -> Self {
229        self.settle = settle;
230        self
231    }
232
233    /// Override the User-Agent string for this request.
234    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
235        self.user_agent = Some(sanitize_user_agent(ua.into()));
236        self
237    }
238}
239
240/// Fetch a single page via the embedded Servo engine.
241#[allow(clippy::needless_pass_by_value)]
242pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
243    crate::net::ensure_crypto_provider();
244
245    crate::net::validate_url(&opts.url)?;
246
247    if matches!(opts.mode, FetchMode::Content)
248        && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
249    {
250        let text = crate::extract::extract_pdf(&bytes);
251        return Ok(Page {
252            html: String::new(),
253            inner_text: text,
254            ..Page::default()
255        });
256    }
257
258    let bridge_opts = crate::bridge::FetchOptions {
259        url: &opts.url,
260        timeout_secs: opts.timeout.as_secs().max(1),
261        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
262        user_agent: opts.user_agent.as_deref(),
263        mode: match opts.mode {
264            FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
265            FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
266            FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
267                expression: expr.clone(),
268            },
269        },
270    };
271
272    let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
273        let msg = format!("{e:#}");
274        if msg.contains("timed out") {
275            Error::Timeout {
276                url: opts.url.clone(),
277                timeout: opts.timeout,
278            }
279        } else {
280            Error::Engine(msg)
281        }
282    })?;
283
284    Ok(Page::from_servo(servo_page))
285}
286
287/// Fetch a URL and return readable Markdown.
288pub fn markdown(url: &str) -> crate::error::Result<String> {
289    fetch(FetchOptions::new(url))?.markdown_with_url(url)
290}
291
292/// Fetch a URL and return structured JSON.
293pub fn extract_json(url: &str) -> crate::error::Result<String> {
294    fetch(FetchOptions::new(url))?.extract_json_with_url(url)
295}
296
297/// Fetch a URL and return plain text (`document.body.innerText`).
298pub fn text(url: &str) -> crate::error::Result<String> {
299    Ok(fetch(FetchOptions::new(url))?.inner_text)
300}
301
// Unit tests for option building, extraction helpers on a hand-built `Page`,
// and the URL checks that `fetch` performs up front.
#[cfg(test)]
mod tests {
    use super::*;

    // `new` yields content mode with a 30s timeout and no settle delay.
    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    // `screenshot` selects screenshot mode and keeps the `full_page` flag.
    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    // `javascript` stores the expression verbatim.
    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    // Setters can be chained and each one only touches its own field.
    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    // A clean User-Agent is stored unchanged.
    #[test]
    fn fetch_user_agent_set() {
        let opts = FetchOptions::new("https://example.com").user_agent("MyBot/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    // No User-Agent override unless one is requested.
    #[test]
    fn fetch_user_agent_default_is_none() {
        let opts = FetchOptions::new("https://example.com");
        assert!(opts.user_agent.is_none());
    }

    // CR and LF are each replaced by a space, so a header cannot be injected.
    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\r\nX-Evil: yes");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot  X-Evil: yes"));
    }

    // A NUL byte is replaced by a space.
    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\0/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot /1.0"));
    }

    // An empty override is kept as `Some("")`, not collapsed to `None`.
    #[test]
    fn fetch_user_agent_empty_string() {
        let opts = FetchOptions::new("https://example.com").user_agent("");
        assert_eq!(opts.user_agent.as_deref(), Some(""));
    }

    // Markdown extraction keeps the body text of a minimal document.
    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    // JSON extraction output must parse as JSON.
    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    // A default page carries no screenshot.
    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    // Only the subtree matched by the selector ends up in the Markdown.
    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = Page {
            html: "<html><body><article>keep</article><aside>drop</aside></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(md.contains("keep"));
        assert!(!md.contains("drop"));
    }

    // Selector-scoped JSON still reports the URL and the scoped text.
    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = Page {
            html: "<html><body><article>scoped</article></body></html>".into(),
            ..Page::default()
        };
        let json = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
        assert_eq!(parsed["url"].as_str(), Some("https://example.com/page"));
        assert!(parsed["text_content"].as_str().unwrap().contains("scoped"));
    }

    // A valid selector that matches nothing yields an empty string, not an error.
    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = Page {
            html: "<html><body><article>x</article></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(md.is_empty());
    }

    // A syntactically invalid selector is reported as an error.
    #[test]
    fn page_markdown_with_invalid_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        let err = page.markdown_with_selector("", "###invalid[[[").unwrap_err();
        assert!(err.to_string().contains("invalid CSS selector"));
    }

    // An empty selector is rejected as well.
    #[test]
    fn page_markdown_with_empty_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        assert!(page.markdown_with_selector("", "").is_err());
    }

    // Input that is not a URL fails validation with `Error::InvalidUrl`.
    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    // Loopback addresses are refused.
    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    // `file://` URLs are refused.
    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
}