1use std::time::Duration;
4
5use crate::error::Error;
6
/// A fetched web page together with the artifacts produced while rendering it.
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// Serialized HTML of the rendered document.
    pub html: String,
    /// Visible text content of the page (empty when the engine provided none).
    pub inner_text: String,
    /// Text of the `<title>` element; `None` when absent or empty.
    pub title: Option<String>,
    /// Layout information as JSON, when the engine produced it.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Result of an evaluated JavaScript expression, if one was requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Console output captured while the page loaded.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree dump, when the engine produced one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    // PNG-encoded screenshot bytes; exposed read-only via `screenshot_png()`.
    // Skipped during serialization because it is a binary payload.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}
31
32impl Page {
33 pub fn markdown(&self) -> crate::error::Result<String> {
35 self.markdown_with_url("")
36 }
37
38 pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
40 let input = crate::extract::ExtractInput::new(&self.html, url)
41 .with_layout_json(self.layout_json.as_deref())
42 .with_inner_text(Some(&self.inner_text));
43 Ok(crate::extract::extract_text(&input)?)
44 }
45
46 pub fn extract_json(&self) -> crate::error::Result<String> {
48 self.extract_json_with_url("")
49 }
50
51 pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
53 let input = crate::extract::ExtractInput::new(&self.html, url)
54 .with_layout_json(self.layout_json.as_deref())
55 .with_inner_text(Some(&self.inner_text));
56 Ok(crate::extract::extract_json(&input)?)
57 }
58
59 #[must_use]
61 pub fn screenshot_png(&self) -> Option<&[u8]> {
62 self.screenshot_png.as_deref()
63 }
64
65 pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
66 let title = {
67 let doc = dom_query::Document::from(page.html.as_str());
68 let t = doc.select("title").text().to_string();
69 if t.is_empty() { None } else { Some(t) }
70 };
71 let screenshot_png = page.screenshot.and_then(|img| {
72 let mut buf = std::io::Cursor::new(Vec::new());
73 img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
74 Some(buf.into_inner())
75 });
76 Self {
77 html: page.html,
78 inner_text: page.inner_text.unwrap_or_default(),
79 title,
80 layout_json: page.layout_json,
81 js_result: page.js_result,
82 console_messages: page
83 .console_messages
84 .into_iter()
85 .map(|m| ConsoleMessage {
86 level: match m.level {
87 crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
88 crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
89 crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
90 crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
91 crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
92 crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
93 },
94 message: m.message,
95 })
96 .collect(),
97 screenshot_png,
98 accessibility_tree: page.accessibility_tree,
99 }
100 }
101}
102
/// A single console entry captured while a page loaded.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity of the message.
    pub level: ConsoleLevel,
    /// The logged text.
    pub message: String,
}
112
/// Severity levels for captured console messages.
///
/// Serialized in lowercase (e.g. `"warn"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    Log,
    Debug,
    Info,
    Warn,
    Error,
    Trace,
}
131
/// What a fetch should produce beyond the rendered page.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Plain content fetch (the default).
    #[default]
    Content,
    /// Capture a screenshot; `full_page` is forwarded to the engine.
    Screenshot {
        full_page: bool,
    },
    /// Evaluate the contained JavaScript expression after load.
    JavaScript(String),
}
141
/// Options for a single [`fetch`] call.
///
/// Build with [`FetchOptions::new`], [`FetchOptions::screenshot`], or
/// [`FetchOptions::javascript`], then chain the setters.
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    // Target URL; validated by `fetch`.
    pub(crate) url: String,
    // Overall fetch timeout (defaults to 30 seconds).
    pub(crate) timeout: Duration,
    // Post-load settle delay (defaults to zero).
    pub(crate) settle: Duration,
    // What the fetch should produce.
    pub(crate) mode: FetchMode,
}
156
157impl FetchOptions {
158 pub fn new(url: &str) -> Self {
160 Self {
161 url: url.into(),
162 timeout: Duration::from_secs(30),
163 settle: Duration::ZERO,
164 mode: FetchMode::Content,
165 }
166 }
167
168 pub fn screenshot(url: &str, full_page: bool) -> Self {
170 Self {
171 mode: FetchMode::Screenshot { full_page },
172 ..Self::new(url)
173 }
174 }
175
176 pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
178 Self {
179 mode: FetchMode::JavaScript(expression.into()),
180 ..Self::new(url)
181 }
182 }
183
184 pub fn timeout(mut self, timeout: Duration) -> Self {
186 self.timeout = timeout;
187 self
188 }
189
190 pub fn settle(mut self, settle: Duration) -> Self {
192 self.settle = settle;
193 self
194 }
195}
196
197#[allow(clippy::needless_pass_by_value)]
202pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
203 ensure_crypto_provider();
204
205 crate::net::validate_url(&opts.url).map_err(|e| Error::InvalidUrl {
206 url: opts.url.clone(),
207 reason: e.to_string(),
208 })?;
209
210 if matches!(opts.mode, FetchMode::Content)
211 && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
212 {
213 let text = crate::extract::extract_pdf(&bytes);
214 return Ok(Page {
215 html: String::new(),
216 inner_text: text,
217 ..Page::default()
218 });
219 }
220
221 let bridge_opts = crate::bridge::FetchOptions {
222 url: &opts.url,
223 timeout_secs: opts.timeout.as_secs().max(1),
224 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
225 mode: match opts.mode {
226 FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
227 FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
228 FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
229 expression: expr.clone(),
230 },
231 },
232 };
233
234 let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
235 let msg = format!("{e:#}");
236 if msg.contains("timed out") {
237 Error::Timeout {
238 url: opts.url.clone(),
239 timeout: opts.timeout,
240 }
241 } else {
242 Error::Engine(msg)
243 }
244 })?;
245
246 Ok(Page::from_servo(servo_page))
247}
248
/// Options for a [`crawl`] or [`crawl_each`] call.
///
/// Build with [`CrawlOptions::new`], then chain the setters.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    // Seed URL; validated before the crawl starts.
    pub(crate) url: String,
    // Maximum number of pages to visit (defaults to 50).
    pub(crate) limit: usize,
    // Maximum link depth from the seed (defaults to 3).
    pub(crate) max_depth: usize,
    // Per-page fetch timeout (defaults to 30 seconds).
    pub(crate) timeout: Duration,
    // Post-load settle delay applied to each page (defaults to zero).
    pub(crate) settle: Duration,
    // Glob patterns of URLs to include; empty means no filter.
    pub(crate) include: Vec<String>,
    // Glob patterns of URLs to exclude; empty means no filter.
    pub(crate) exclude: Vec<String>,
    // Optional CSS selector restricting extraction.
    pub(crate) selector: Option<String>,
    // Emit structured JSON instead of text content when true.
    pub(crate) json: bool,
}
263
264impl CrawlOptions {
265 pub fn new(url: &str) -> Self {
267 Self {
268 url: url.into(),
269 limit: 50,
270 max_depth: 3,
271 timeout: Duration::from_secs(30),
272 settle: Duration::ZERO,
273 include: Vec::new(),
274 exclude: Vec::new(),
275 selector: None,
276 json: false,
277 }
278 }
279
280 pub fn limit(mut self, n: usize) -> Self {
282 self.limit = n;
283 self
284 }
285
286 pub fn max_depth(mut self, n: usize) -> Self {
288 self.max_depth = n;
289 self
290 }
291
292 pub fn timeout(mut self, timeout: Duration) -> Self {
294 self.timeout = timeout;
295 self
296 }
297
298 pub fn settle(mut self, settle: Duration) -> Self {
300 self.settle = settle;
301 self
302 }
303
304 pub fn include(mut self, patterns: &[&str]) -> Self {
306 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
307 self
308 }
309
310 pub fn exclude(mut self, patterns: &[&str]) -> Self {
312 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
313 self
314 }
315
316 pub fn json(mut self, json: bool) -> Self {
318 self.json = json;
319 self
320 }
321
322 pub fn selector(mut self, selector: impl Into<String>) -> Self {
324 self.selector = Some(selector.into());
325 self
326 }
327}
328
/// Outcome of visiting a single page during a crawl.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL that was visited.
    pub url: String,
    /// Link depth of this page relative to the seed URL.
    pub depth: usize,
    /// Whether the page was processed successfully.
    pub status: CrawlStatus,
    /// Page title, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    /// Extracted content, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<String>,
    /// Error description when `status` indicates failure.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    /// Number of links discovered on this page.
    pub links_found: usize,
}
351
352impl CrawlResult {
353 fn from_internal(r: &crate::crawl::CrawlPageResult) -> Self {
354 Self {
355 url: r.url.clone(),
356 depth: r.depth,
357 status: match r.status {
358 crate::crawl::CrawlStatus::Ok => CrawlStatus::Ok,
359 crate::crawl::CrawlStatus::Error => CrawlStatus::Error,
360 },
361 title: r.title.clone(),
362 content: r.content.clone(),
363 error: r.error.clone(),
364 links_found: r.links_found,
365 }
366 }
367}
368
/// Whether a crawled page was processed successfully.
///
/// Serialized in lowercase (`"ok"` / `"error"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum CrawlStatus {
    Ok,
    Error,
}
379
380#[allow(clippy::needless_pass_by_value)]
382pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
383 let internal_opts = build_crawl_options(&opts)?;
384 crate::runtime::block_on(crate::crawl::run(internal_opts, |r| {
385 on_page(&CrawlResult::from_internal(r));
386 }))
387 .map_err(|e| Error::Engine(e.to_string()))?;
388 Ok(())
389}
390
391#[allow(clippy::needless_pass_by_value)]
393pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
394 let mut results = Vec::new();
395 crawl_each(opts, |r| results.push(r.clone()))?;
396 Ok(results)
397}
398
399pub fn markdown(url: &str) -> crate::error::Result<String> {
401 fetch(FetchOptions::new(url))?.markdown_with_url(url)
402}
403
404pub fn extract_json(url: &str) -> crate::error::Result<String> {
406 fetch(FetchOptions::new(url))?.extract_json_with_url(url)
407}
408
409pub fn text(url: &str) -> crate::error::Result<String> {
411 Ok(fetch(FetchOptions::new(url))?.inner_text)
412}
413
414pub fn validate_url(url: &str) -> crate::error::Result<url::Url> {
416 crate::net::validate_url(url).map_err(|e| Error::InvalidUrl {
417 url: url.into(),
418 reason: e.to_string(),
419 })
420}
421
/// Install the process-wide rustls crypto provider (best effort).
///
/// The result is deliberately ignored: `install_default` fails when a
/// provider was already installed, which is expected on repeated calls.
fn ensure_crypto_provider() {
    let _ = rustls::crypto::aws_lc_rs::default_provider().install_default();
}
425
426fn build_crawl_options(opts: &CrawlOptions) -> crate::error::Result<crate::crawl::CrawlOptions> {
427 let seed = crate::net::validate_url(&opts.url).map_err(|e| Error::InvalidUrl {
428 url: opts.url.clone(),
429 reason: e.to_string(),
430 })?;
431 let include = if opts.include.is_empty() {
432 None
433 } else {
434 Some(crate::crawl::build_globset(&opts.include).map_err(|e| Error::Engine(e.to_string()))?)
435 };
436 let exclude = if opts.exclude.is_empty() {
437 None
438 } else {
439 Some(crate::crawl::build_globset(&opts.exclude).map_err(|e| Error::Engine(e.to_string()))?)
440 };
441 Ok(crate::crawl::CrawlOptions {
442 seed,
443 limit: opts.limit,
444 max_depth: opts.max_depth,
445 timeout_secs: opts.timeout.as_secs().max(1),
446 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
447 include,
448 exclude,
449 selector: opts.selector.clone(),
450 json: opts.json,
451 })
452}
453
#[cfg(test)]
mod tests {
    use super::*;

    // --- FetchOptions builder defaults and constructors ---

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    // --- CrawlOptions builder defaults and chaining ---

    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
    }

    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"]);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
    }

    // --- Page conversion helpers (no network; operate on in-memory HTML) ---

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    // --- URL validation rejections (fetch fails fast before any network I/O) ---

    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
}