servo_fetch/engine.rs

//! Servo browser engine facade.

use std::time::Duration;

use crate::error::Error;

/// Rendered page returned by [`fetch`].
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// Fully rendered HTML after JavaScript execution.
    pub html: String,
    /// Plain text content (`document.body.innerText`).
    pub inner_text: String,
    /// Page title extracted from the `<title>` tag.
    pub title: Option<String>,
    /// Parsed layout data from the injected CSS heuristics script.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Result of JavaScript evaluation, if [`FetchOptions::javascript`] was used.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Browser console messages captured during page load.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree (AccessKit), if requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}

impl Page {
    /// Extract readable Markdown from this page.
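    ///
    /// # Examples
    ///
    /// A minimal sketch, assuming the crate root re-exports [`fetch`] and
    /// [`FetchOptions`] as `servo_fetch::*`:
    ///
    /// ```no_run
    /// // Fetch a rendered page, then extract readable Markdown from it.
    /// let page = servo_fetch::fetch(servo_fetch::FetchOptions::new("https://example.com"))
    ///     .expect("fetch failed");
    /// let md = page.markdown().expect("extraction failed");
    /// assert!(!md.is_empty());
    /// ```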
    pub fn markdown(&self) -> crate::error::Result<String> {
        self.markdown_with_url("")
    }

    /// Extract readable Markdown, using the original URL for link resolution.
    pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
        let input = crate::extract::ExtractInput::new(&self.html, url)
            .with_layout_json(self.layout_json.as_deref())
            .with_inner_text(Some(&self.inner_text));
        Ok(crate::extract::extract_text(&input)?)
    }

    /// Extract structured JSON from this page.
    pub fn extract_json(&self) -> crate::error::Result<String> {
        self.extract_json_with_url("")
    }

    /// Extract structured JSON, using the original URL for link resolution.
    pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
        let input = crate::extract::ExtractInput::new(&self.html, url)
            .with_layout_json(self.layout_json.as_deref())
            .with_inner_text(Some(&self.inner_text));
        Ok(crate::extract::extract_json(&input)?)
    }

    /// PNG screenshot bytes, if captured via [`FetchOptions::screenshot`].
    #[must_use]
    pub fn screenshot_png(&self) -> Option<&[u8]> {
        self.screenshot_png.as_deref()
    }

    pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
        let title = {
            let doc = dom_query::Document::from(page.html.as_str());
            let t = doc.select("title").text().to_string();
            if t.is_empty() { None } else { Some(t) }
        };
        let screenshot_png = page.screenshot.and_then(|img| {
            let mut buf = std::io::Cursor::new(Vec::new());
            img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
            Some(buf.into_inner())
        });
        Self {
            html: page.html,
            inner_text: page.inner_text.unwrap_or_default(),
            title,
            layout_json: page.layout_json,
            js_result: page.js_result,
            console_messages: page
                .console_messages
                .into_iter()
                .map(|m| ConsoleMessage {
                    level: match m.level {
                        crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
                        crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
                        crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
                        crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
                        crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
                        crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
                    },
                    message: m.message,
                })
                .collect(),
            screenshot_png,
            accessibility_tree: page.accessibility_tree,
        }
    }
}

/// Browser console message captured during page load.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity level.
    pub level: ConsoleLevel,
    /// Message text.
    pub message: String,
}

/// Console message severity.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    /// General log message.
    Log,
    /// Debug-level message.
    Debug,
    /// Informational message.
    Info,
    /// Warning message.
    Warn,
    /// Error message.
    Error,
    /// Trace-level message.
    Trace,
}

#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    #[default]
    Content,
    Screenshot {
        full_page: bool,
    },
    JavaScript(String),
}

/// Options for a single page fetch.
///
/// # Thread Safety
///
/// [`fetch`] is safe to call from multiple threads. Each call queues a request
/// to the shared Servo engine thread, which processes them sequentially.
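///
/// # Examples
///
/// A minimal builder sketch (crate-root re-exports assumed):
///
/// ```no_run
/// use std::time::Duration;
///
/// // Chain builder methods, then pass the options to fetch().
/// let opts = servo_fetch::FetchOptions::new("https://example.com")
///     .timeout(Duration::from_secs(60))
///     .settle(Duration::from_millis(500));
/// let page = servo_fetch::fetch(opts).expect("fetch failed");
/// println!("{:?}", page.title);
/// ```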
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    pub(crate) url: String,
    pub(crate) timeout: Duration,
    pub(crate) settle: Duration,
    pub(crate) mode: FetchMode,
}

impl FetchOptions {
    /// Fetch rendered content (default mode).
    pub fn new(url: &str) -> Self {
        Self {
            url: url.into(),
            timeout: Duration::from_secs(30),
            settle: Duration::ZERO,
            mode: FetchMode::Content,
        }
    }

    /// Capture a PNG screenshot.
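    ///
    /// # Examples
    ///
    /// A minimal sketch (crate-root re-exports assumed); `page.png` is a
    /// hypothetical output path:
    ///
    /// ```no_run
    /// let opts = servo_fetch::FetchOptions::screenshot("https://example.com", true);
    /// let page = servo_fetch::fetch(opts).expect("fetch failed");
    /// // PNG bytes are only present when screenshot mode was requested.
    /// if let Some(png) = page.screenshot_png() {
    ///     std::fs::write("page.png", png).expect("write failed");
    /// }
    /// ```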
    pub fn screenshot(url: &str, full_page: bool) -> Self {
        Self {
            mode: FetchMode::Screenshot { full_page },
            ..Self::new(url)
        }
    }

    /// Execute a JavaScript expression and return the result.
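    ///
    /// # Examples
    ///
    /// A minimal sketch (crate-root re-exports assumed); the evaluated result
    /// lands in [`Page::js_result`]:
    ///
    /// ```no_run
    /// let opts = servo_fetch::FetchOptions::javascript("https://example.com", "document.title");
    /// let page = servo_fetch::fetch(opts).expect("fetch failed");
    /// println!("{:?}", page.js_result);
    /// ```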
    pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
        Self {
            mode: FetchMode::JavaScript(expression.into()),
            ..Self::new(url)
        }
    }

    /// Page load timeout (default: 30s).
    pub fn timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }

    /// Extra wait after the load event for SPA hydration (default: 0).
    pub fn settle(mut self, settle: Duration) -> Self {
        self.settle = settle;
        self
    }
}

/// Fetch a single page via the embedded Servo engine.
///
/// The first call spawns a persistent engine thread that lives for the process
/// lifetime. If the engine thread panics, this returns [`Error::Engine`].
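///
/// # Examples
///
/// A minimal sketch, assuming the crate root re-exports `fetch` and
/// `FetchOptions`:
///
/// ```no_run
/// let page = servo_fetch::fetch(servo_fetch::FetchOptions::new("https://example.com"))
///     .expect("fetch failed");
/// println!("title: {:?}", page.title);
/// println!("console messages: {}", page.console_messages.len());
/// ```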
#[allow(clippy::needless_pass_by_value)]
pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
    crate::net::validate_url(&opts.url).map_err(|e| Error::InvalidUrl {
        url: opts.url.clone(),
        reason: e.to_string(),
    })?;

    if matches!(opts.mode, FetchMode::Content)
        && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
    {
        let text = crate::extract::extract_pdf(&bytes);
        return Ok(Page {
            html: String::new(),
            inner_text: text,
            ..Page::default()
        });
    }

    let bridge_opts = crate::bridge::FetchOptions {
        url: &opts.url,
        timeout_secs: opts.timeout.as_secs().max(1),
        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
        mode: match opts.mode {
            FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
            FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
            FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
                expression: expr.clone(),
            },
        },
    };

    let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
        let msg = format!("{e:#}");
        if msg.contains("timed out") {
            Error::Timeout {
                url: opts.url.clone(),
                timeout: opts.timeout,
            }
        } else {
            Error::Engine(msg)
        }
    })?;

    Ok(Page::from_servo(servo_page))
}

/// Options for crawling a site.
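///
/// # Examples
///
/// A minimal builder sketch (crate-root re-exports assumed):
///
/// ```no_run
/// let opts = servo_fetch::CrawlOptions::new("https://example.com/docs/")
///     .limit(20)
///     .max_depth(2)
///     .include(&["/docs/**"])
///     .exclude(&["/docs/archive/**"]);
/// let results = servo_fetch::crawl(opts).expect("crawl failed");
/// println!("crawled {} pages", results.len());
/// ```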
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    pub(crate) url: String,
    pub(crate) limit: usize,
    pub(crate) max_depth: usize,
    pub(crate) timeout: Duration,
    pub(crate) settle: Duration,
    pub(crate) include: Vec<String>,
    pub(crate) exclude: Vec<String>,
    pub(crate) selector: Option<String>,
    pub(crate) json: bool,
}

impl CrawlOptions {
    /// Create crawl options for the given seed URL.
    pub fn new(url: &str) -> Self {
        Self {
            url: url.into(),
            limit: 50,
            max_depth: 3,
            timeout: Duration::from_secs(30),
            settle: Duration::ZERO,
            include: Vec::new(),
            exclude: Vec::new(),
            selector: None,
            json: false,
        }
    }

    /// Maximum number of pages to crawl (default: 50).
    pub fn limit(mut self, n: usize) -> Self {
        self.limit = n;
        self
    }

    /// Maximum link depth from the seed URL (default: 3).
    pub fn max_depth(mut self, n: usize) -> Self {
        self.max_depth = n;
        self
    }

    /// Page load timeout per page (default: 30s).
    pub fn timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }

    /// Extra wait after the load event per page (default: 0).
    pub fn settle(mut self, settle: Duration) -> Self {
        self.settle = settle;
        self
    }

    /// URL path glob patterns to include (e.g. `"/docs/**"`).
    pub fn include(mut self, patterns: &[&str]) -> Self {
        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
        self
    }

    /// URL path glob patterns to exclude (e.g. `"/docs/archive/**"`).
    pub fn exclude(mut self, patterns: &[&str]) -> Self {
        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
        self
    }

    /// Output crawled content as JSON instead of Markdown.
    pub fn json(mut self, json: bool) -> Self {
        self.json = json;
        self
    }

    /// CSS selector to extract a specific section per page.
    pub fn selector(mut self, selector: impl Into<String>) -> Self {
        self.selector = Some(selector.into());
        self
    }
}

/// Result for a single crawled page.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL of the crawled page.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Fetch status for this page.
    pub status: CrawlStatus,
    /// Page title, if extraction succeeded.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    /// Extracted content (Markdown or JSON depending on options).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<String>,
    /// Error message, if the page failed to load.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    /// Number of links discovered on this page.
    pub links_found: usize,
}

impl CrawlResult {
    fn from_internal(r: &crate::crawl::CrawlPageResult) -> Self {
        Self {
            url: r.url.clone(),
            depth: r.depth,
            status: match r.status {
                crate::crawl::CrawlStatus::Ok => CrawlStatus::Ok,
                crate::crawl::CrawlStatus::Error => CrawlStatus::Error,
            },
            title: r.title.clone(),
            content: r.content.clone(),
            error: r.error.clone(),
            links_found: r.links_found,
        }
    }
}

/// Status of a crawled page.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum CrawlStatus {
    /// Page fetched and extracted successfully.
    Ok,
    /// Page failed to load or extract.
    Error,
}

/// Crawl a site, invoking `on_page` for each result as it arrives.
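///
/// # Examples
///
/// A minimal streaming sketch (crate-root re-exports assumed):
///
/// ```no_run
/// let opts = servo_fetch::CrawlOptions::new("https://example.com").limit(10);
/// // The callback fires once per page, as results arrive.
/// servo_fetch::crawl_each(opts, |r| {
///     println!("[depth {}] {} ({:?})", r.depth, r.url, r.status);
/// })
/// .expect("crawl failed");
/// ```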
#[allow(clippy::needless_pass_by_value)]
pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
    let internal_opts = build_crawl_options(&opts)?;
    crate::runtime::block_on(crate::crawl::run(internal_opts, |r| {
        on_page(&CrawlResult::from_internal(r));
    }))
    .map_err(|e| Error::Engine(e.to_string()))?;
    Ok(())
}

/// Crawl a site and collect all results.
#[allow(clippy::needless_pass_by_value)]
pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
    let mut results = Vec::new();
    crawl_each(opts, |r| results.push(r.clone()))?;
    Ok(results)
}

/// Fetch a URL and return readable Markdown.
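///
/// # Examples
///
/// A one-liner sketch (crate-root re-export assumed):
///
/// ```no_run
/// let md = servo_fetch::markdown("https://example.com").expect("fetch failed");
/// println!("{md}");
/// ```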
pub fn markdown(url: &str) -> crate::error::Result<String> {
    fetch(FetchOptions::new(url))?.markdown_with_url(url)
}

/// Fetch a URL and return structured JSON.
pub fn extract_json(url: &str) -> crate::error::Result<String> {
    fetch(FetchOptions::new(url))?.extract_json_with_url(url)
}

/// Fetch a URL and return plain text (`document.body.innerText`).
pub fn text(url: &str) -> crate::error::Result<String> {
    Ok(fetch(FetchOptions::new(url))?.inner_text)
}

/// Validate a URL for fetching. Rejects disallowed schemes and private addresses.
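///
/// # Examples
///
/// A minimal sketch (crate-root re-export assumed); the rejected inputs
/// mirror the cases exercised in this module's tests:
///
/// ```no_run
/// assert!(servo_fetch::validate_url("https://example.com").is_ok());
/// assert!(servo_fetch::validate_url("file:///etc/passwd").is_err());
/// assert!(servo_fetch::validate_url("http://127.0.0.1/").is_err());
/// ```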
pub fn validate_url(url: &str) -> crate::error::Result<url::Url> {
    crate::net::validate_url(url).map_err(|e| Error::InvalidUrl {
        url: url.into(),
        reason: e.to_string(),
    })
}

fn build_crawl_options(opts: &CrawlOptions) -> crate::error::Result<crate::crawl::CrawlOptions> {
    let seed = crate::net::validate_url(&opts.url).map_err(|e| Error::InvalidUrl {
        url: opts.url.clone(),
        reason: e.to_string(),
    })?;
    let include = if opts.include.is_empty() {
        None
    } else {
        Some(crate::crawl::build_globset(&opts.include).map_err(|e| Error::Engine(e.to_string()))?)
    };
    let exclude = if opts.exclude.is_empty() {
        None
    } else {
        Some(crate::crawl::build_globset(&opts.exclude).map_err(|e| Error::Engine(e.to_string()))?)
    };
    Ok(crate::crawl::CrawlOptions {
        seed,
        limit: opts.limit,
        max_depth: opts.max_depth,
        timeout_secs: opts.timeout.as_secs().max(1),
        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
        include,
        exclude,
        selector: opts.selector.clone(),
        json: opts.json,
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
    }

    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"]);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
    }

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
}