
1//! Servo browser engine facade.
2
3use std::time::Duration;
4
5use crate::error::Error;
6
/// Rendered page returned by [`fetch`].
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// Fully rendered HTML after JavaScript execution.
    pub html: String,
    /// Plain text content (`document.body.innerText`).
    pub inner_text: String,
    /// Page title extracted from `<title>` tag.
    pub title: Option<String>,
    /// Parsed layout data from the injected CSS heuristics script.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Result of JavaScript evaluation, if [`FetchOptions::javascript`] was used.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Browser console messages captured during page load.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree (AccessKit), if requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    // PNG-encoded screenshot bytes. Private (and excluded from serde output)
    // so callers must go through `Page::screenshot_png()`.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}
31
32impl Page {
33    /// Extract readable Markdown from this page.
34    pub fn markdown(&self) -> crate::error::Result<String> {
35        self.markdown_with_url("")
36    }
37
38    /// Extract readable Markdown, using the original URL for link resolution.
39    pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
40        let input = crate::extract::ExtractInput::new(&self.html, url)
41            .with_layout_json(self.layout_json.as_deref())
42            .with_inner_text(Some(&self.inner_text));
43        Ok(crate::extract::extract_text(&input)?)
44    }
45
46    /// Extract structured JSON from this page.
47    pub fn extract_json(&self) -> crate::error::Result<String> {
48        self.extract_json_with_url("")
49    }
50
51    /// Extract structured JSON, using the original URL for link resolution.
52    pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
53        let input = crate::extract::ExtractInput::new(&self.html, url)
54            .with_layout_json(self.layout_json.as_deref())
55            .with_inner_text(Some(&self.inner_text));
56        Ok(crate::extract::extract_json(&input)?)
57    }
58
59    /// PNG screenshot bytes, if captured via [`FetchOptions::screenshot`].
60    #[must_use]
61    pub fn screenshot_png(&self) -> Option<&[u8]> {
62        self.screenshot_png.as_deref()
63    }
64
65    pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
66        let title = {
67            let doc = dom_query::Document::from(page.html.as_str());
68            let t = doc.select("title").text().to_string();
69            if t.is_empty() { None } else { Some(t) }
70        };
71        let screenshot_png = page.screenshot.and_then(|img| {
72            let mut buf = std::io::Cursor::new(Vec::new());
73            img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
74            Some(buf.into_inner())
75        });
76        Self {
77            html: page.html,
78            inner_text: page.inner_text.unwrap_or_default(),
79            title,
80            layout_json: page.layout_json,
81            js_result: page.js_result,
82            console_messages: page
83                .console_messages
84                .into_iter()
85                .map(|m| ConsoleMessage {
86                    level: match m.level {
87                        crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
88                        crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
89                        crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
90                        crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
91                        crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
92                        crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
93                    },
94                    message: m.message,
95                })
96                .collect(),
97            screenshot_png,
98            accessibility_tree: page.accessibility_tree,
99        }
100    }
101}
102
/// Browser console message captured during page load.
///
/// Exposed via [`Page::console_messages`].
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity level.
    pub level: ConsoleLevel,
    /// Message text.
    pub message: String,
}
112
/// Console message severity.
///
/// Serialized in lowercase, matching [`ConsoleLevel::as_str`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    /// General log message.
    Log,
    /// Debug-level message.
    Debug,
    /// Informational message.
    Info,
    /// Warning message.
    Warn,
    /// Error message.
    Error,
    /// Trace-level message.
    Trace,
}
131
132impl ConsoleLevel {
133    /// Returns the string representation of this level.
134    #[must_use]
135    pub fn as_str(&self) -> &'static str {
136        match self {
137            Self::Log => "log",
138            Self::Debug => "debug",
139            Self::Info => "info",
140            Self::Warn => "warn",
141            Self::Error => "error",
142            Self::Trace => "trace",
143        }
144    }
145}
146
147impl std::fmt::Display for ConsoleLevel {
148    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
149        f.pad(self.as_str())
150    }
151}
152
/// Internal fetch mode selected by the [`FetchOptions`] constructors.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Fetch rendered page content (the default).
    #[default]
    Content,
    /// Capture a PNG screenshot of the page.
    Screenshot {
        /// Forwarded to `crate::bridge::FetchMode::Screenshot`.
        full_page: bool,
    },
    /// Evaluate a JavaScript expression after the page loads.
    JavaScript(String),
}
162
/// Options for a single page fetch.
///
/// # Thread Safety
///
/// [`fetch`] is safe to call from multiple threads. Each call queues a request
/// to the shared Servo engine thread, which processes them sequentially.
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    /// Target URL.
    pub(crate) url: String,
    /// Page load timeout (default 30s; clamped to >= 1s at fetch time).
    pub(crate) timeout: Duration,
    /// Extra wait after the load event (default 0).
    pub(crate) settle: Duration,
    /// What to do once the page has loaded.
    pub(crate) mode: FetchMode,
    /// Optional User-Agent override, sanitized via `sanitize_user_agent`.
    pub(crate) user_agent: Option<String>,
}
178
179impl FetchOptions {
180    /// Fetch rendered content (default mode).
181    pub fn new(url: &str) -> Self {
182        Self {
183            url: url.into(),
184            timeout: Duration::from_secs(30),
185            settle: Duration::ZERO,
186            mode: FetchMode::Content,
187            user_agent: None,
188        }
189    }
190
191    /// Capture a PNG screenshot.
192    pub fn screenshot(url: &str, full_page: bool) -> Self {
193        Self {
194            mode: FetchMode::Screenshot { full_page },
195            ..Self::new(url)
196        }
197    }
198
199    /// Execute a JavaScript expression and return the result.
200    pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
201        Self {
202            mode: FetchMode::JavaScript(expression.into()),
203            ..Self::new(url)
204        }
205    }
206
207    /// Page load timeout (default: 30s).
208    pub fn timeout(mut self, timeout: Duration) -> Self {
209        self.timeout = timeout;
210        self
211    }
212
213    /// Extra wait after load event for SPA hydration (default: 0).
214    pub fn settle(mut self, settle: Duration) -> Self {
215        self.settle = settle;
216        self
217    }
218
219    /// Override the User-Agent string for this request.
220    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
221        self.user_agent = Some(sanitize_user_agent(ua.into()));
222        self
223    }
224}
225
226/// Fetch a single page via the embedded Servo engine.
227///
228/// The first call spawns a persistent engine thread that lives for the process
229/// lifetime. If the engine thread panics, this returns [`Error::Engine`].
230#[allow(clippy::needless_pass_by_value)]
231pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
232    ensure_crypto_provider();
233
234    crate::net::validate_url(&opts.url).map_err(|e| map_url_error(&opts.url, e))?;
235
236    if matches!(opts.mode, FetchMode::Content)
237        && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
238    {
239        let text = crate::extract::extract_pdf(&bytes);
240        return Ok(Page {
241            html: String::new(),
242            inner_text: text,
243            ..Page::default()
244        });
245    }
246
247    let bridge_opts = crate::bridge::FetchOptions {
248        url: &opts.url,
249        timeout_secs: opts.timeout.as_secs().max(1),
250        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
251        user_agent: opts.user_agent.as_deref(),
252        mode: match opts.mode {
253            FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
254            FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
255            FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
256                expression: expr.clone(),
257            },
258        },
259    };
260
261    let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
262        let msg = format!("{e:#}");
263        if msg.contains("timed out") {
264            Error::Timeout {
265                url: opts.url.clone(),
266                timeout: opts.timeout,
267            }
268        } else {
269            Error::Engine(msg)
270        }
271    })?;
272
273    Ok(Page::from_servo(servo_page))
274}
275
/// Options for crawling a site.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    /// Seed URL the crawl starts from.
    pub(crate) url: String,
    /// Maximum number of pages to crawl (default 50).
    pub(crate) limit: usize,
    /// Maximum link depth from the seed (default 3).
    pub(crate) max_depth: usize,
    /// Per-page load timeout (default 30s; clamped to >= 1s).
    pub(crate) timeout: Duration,
    /// Extra wait after the load event per page (default 0).
    pub(crate) settle: Duration,
    /// URL path glob patterns to include (empty = no include filter).
    pub(crate) include: Vec<String>,
    /// URL path glob patterns to exclude (empty = no exclude filter).
    pub(crate) exclude: Vec<String>,
    /// Optional CSS selector extracting a specific section per page.
    pub(crate) selector: Option<String>,
    /// Emit JSON instead of Markdown for page content.
    pub(crate) json: bool,
    /// Optional User-Agent override, sanitized via `sanitize_user_agent`.
    pub(crate) user_agent: Option<String>,
}
291
292impl CrawlOptions {
293    /// Create crawl options for the given seed URL.
294    pub fn new(url: &str) -> Self {
295        Self {
296            url: url.into(),
297            limit: 50,
298            max_depth: 3,
299            timeout: Duration::from_secs(30),
300            settle: Duration::ZERO,
301            include: Vec::new(),
302            exclude: Vec::new(),
303            selector: None,
304            json: false,
305            user_agent: None,
306        }
307    }
308
309    /// Maximum number of pages to crawl (default: 50).
310    pub fn limit(mut self, n: usize) -> Self {
311        self.limit = n;
312        self
313    }
314
315    /// Maximum link depth from the seed URL (default: 3).
316    pub fn max_depth(mut self, n: usize) -> Self {
317        self.max_depth = n;
318        self
319    }
320
321    /// Page load timeout per page (default: 30s).
322    pub fn timeout(mut self, timeout: Duration) -> Self {
323        self.timeout = timeout;
324        self
325    }
326
327    /// Extra wait after load event per page (default: 0).
328    pub fn settle(mut self, settle: Duration) -> Self {
329        self.settle = settle;
330        self
331    }
332
333    /// URL path glob patterns to include (e.g. `"/docs/**"`).
334    pub fn include(mut self, patterns: &[&str]) -> Self {
335        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
336        self
337    }
338
339    /// URL path glob patterns to exclude (e.g. `"/docs/archive/**"`).
340    pub fn exclude(mut self, patterns: &[&str]) -> Self {
341        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
342        self
343    }
344
345    /// Output crawled content as JSON instead of Markdown.
346    pub fn json(mut self, json: bool) -> Self {
347        self.json = json;
348        self
349    }
350
351    /// CSS selector to extract a specific section per page.
352    pub fn selector(mut self, selector: impl Into<String>) -> Self {
353        self.selector = Some(selector.into());
354        self
355    }
356
357    /// Override the User-Agent string for all pages in this crawl.
358    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
359        self.user_agent = Some(sanitize_user_agent(ua.into()));
360        self
361    }
362}
363
/// Result for a single crawled page.
///
/// Serialized via a manual [`serde::Serialize`] impl that flattens the
/// outcome into a `"status"` field plus per-status fields.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL of the crawled page.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Page content if successful, or error if failed.
    pub outcome: Result<CrawlPage, CrawlError>,
}
375
/// Successfully crawled page.
// PartialEq/Eq added for parity with `ConsoleMessage` and so callers/tests
// can compare results directly; all fields already support equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CrawlPage {
    /// Page title.
    pub title: Option<String>,
    /// Extracted content (Markdown or JSON depending on options).
    pub content: String,
    /// Number of links discovered on this page.
    pub links_found: usize,
}
386
/// Error from a failed crawl attempt.
// PartialEq/Eq added for parity with `ConsoleMessage`/`CrawlPage` so crawl
// outcomes (`Result<CrawlPage, CrawlError>`) can be compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CrawlError {
    /// Error message.
    pub message: String,
}
393
394impl std::fmt::Display for CrawlError {
395    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
396        f.write_str(&self.message)
397    }
398}
399
400impl std::error::Error for CrawlError {}
401
402impl serde::Serialize for CrawlResult {
403    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
404        use serde::ser::SerializeMap;
405        match &self.outcome {
406            Ok(page) => {
407                let mut map = serializer.serialize_map(None)?;
408                map.serialize_entry("url", &self.url)?;
409                map.serialize_entry("depth", &self.depth)?;
410                map.serialize_entry("status", "ok")?;
411                if let Some(t) = &page.title {
412                    map.serialize_entry("title", t)?;
413                }
414                map.serialize_entry("content", &page.content)?;
415                map.serialize_entry("links_found", &page.links_found)?;
416                map.end()
417            }
418            Err(e) => {
419                let mut map = serializer.serialize_map(None)?;
420                map.serialize_entry("url", &self.url)?;
421                map.serialize_entry("depth", &self.depth)?;
422                map.serialize_entry("status", "error")?;
423                map.serialize_entry("error", &e.message)?;
424                map.end()
425            }
426        }
427    }
428}
429
430impl CrawlResult {
431    fn from_internal(r: &crate::crawl::CrawlPageResult) -> Self {
432        let outcome = match r.status {
433            crate::crawl::CrawlStatus::Ok => Ok(CrawlPage {
434                title: r.title.clone(),
435                content: r.content.clone().unwrap_or_default(),
436                links_found: r.links_found,
437            }),
438            crate::crawl::CrawlStatus::Error => Err(CrawlError {
439                message: r.error.clone().unwrap_or_default(),
440            }),
441        };
442        Self {
443            url: r.url.clone(),
444            depth: r.depth,
445            outcome,
446        }
447    }
448}
449
/// Status of a crawled page.
///
/// NOTE(review): public mirror of `crate::crawl::CrawlStatus`; nothing in
/// this file constructs it — confirm it is still referenced elsewhere.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum CrawlStatus {
    /// Page fetched and extracted successfully.
    Ok,
    /// Page failed to load or extract.
    Error,
}
460
461/// Crawl a site, invoking `on_page` for each result as it arrives.
462#[allow(clippy::needless_pass_by_value)]
463pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
464    ensure_crypto_provider();
465    let internal_opts = build_crawl_options(&opts)?;
466    crate::runtime::block_on(crate::crawl::run(internal_opts, |r| {
467        on_page(&CrawlResult::from_internal(r));
468    }))
469    .map_err(|e| Error::Engine(e.to_string()))?;
470    Ok(())
471}
472
473/// Crawl a site and collect all results.
474#[allow(clippy::needless_pass_by_value)]
475pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
476    let mut results = Vec::new();
477    crawl_each(opts, |r| results.push(r.clone()))?;
478    Ok(results)
479}
480
481/// Fetch a URL and return readable Markdown.
482pub fn markdown(url: &str) -> crate::error::Result<String> {
483    fetch(FetchOptions::new(url))?.markdown_with_url(url)
484}
485
486/// Fetch a URL and return structured JSON.
487pub fn extract_json(url: &str) -> crate::error::Result<String> {
488    fetch(FetchOptions::new(url))?.extract_json_with_url(url)
489}
490
491/// Fetch a URL and return plain text (`document.body.innerText`).
492pub fn text(url: &str) -> crate::error::Result<String> {
493    Ok(fetch(FetchOptions::new(url))?.inner_text)
494}
495
496/// Validate a URL for fetching. Rejects disallowed schemes and private addresses.
497pub fn validate_url(url: &str) -> crate::error::Result<url::Url> {
498    crate::net::validate_url(url).map_err(|e| map_url_error(url, e))
499}
500
/// Install the process-wide rustls crypto provider.
///
/// `install_default` fails when a provider is already installed — the
/// expected steady state after the first call — so the result is
/// deliberately ignored.
fn ensure_crypto_provider() {
    let _ = rustls::crypto::aws_lc_rs::default_provider().install_default();
}
504
/// Replace CR, LF, and NUL with SP per RFC 9110.
///
/// Returns the input unchanged (no reallocation) when it is already clean.
pub(crate) fn sanitize_user_agent(ua: String) -> String {
    const FORBIDDEN: [char; 3] = ['\r', '\n', '\0'];
    if ua.contains(FORBIDDEN) {
        ua.replace(FORBIDDEN, " ")
    } else {
        ua
    }
}
513
514fn map_url_error(url: &str, e: crate::net::UrlError) -> Error {
515    match e {
516        crate::net::UrlError::PrivateAddress(host) => Error::AddressNotAllowed(host),
517        crate::net::UrlError::Invalid(reason) => Error::InvalidUrl {
518            url: url.into(),
519            reason,
520        },
521    }
522}
523
524fn build_crawl_options(opts: &CrawlOptions) -> crate::error::Result<crate::crawl::CrawlOptions> {
525    let seed = crate::net::validate_url(&opts.url).map_err(|e| map_url_error(&opts.url, e))?;
526    let include = if opts.include.is_empty() {
527        None
528    } else {
529        Some(crate::crawl::build_globset(&opts.include).map_err(|e| Error::Engine(e.to_string()))?)
530    };
531    let exclude = if opts.exclude.is_empty() {
532        None
533    } else {
534        Some(crate::crawl::build_globset(&opts.exclude).map_err(|e| Error::Engine(e.to_string()))?)
535    };
536    Ok(crate::crawl::CrawlOptions {
537        seed,
538        limit: opts.limit,
539        max_depth: opts.max_depth,
540        timeout_secs: opts.timeout.as_secs().max(1),
541        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
542        include,
543        exclude,
544        selector: opts.selector.clone(),
545        json: opts.json,
546        user_agent: opts.user_agent.clone(),
547    })
548}
549
#[cfg(test)]
mod tests {
    use super::*;

    // --- Option builder defaults and chaining (no engine involved) ---

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
    }

    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"]);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
    }

    // --- User-Agent sanitization (header-injection hardening) ---

    #[test]
    fn fetch_user_agent_set() {
        let opts = FetchOptions::new("https://example.com").user_agent("MyBot/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    #[test]
    fn fetch_user_agent_default_is_none() {
        let opts = FetchOptions::new("https://example.com");
        assert!(opts.user_agent.is_none());
    }

    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\r\nX-Evil: yes");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot  X-Evil: yes"));
    }

    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\0/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot /1.0"));
    }

    #[test]
    fn fetch_user_agent_empty_string() {
        let opts = FetchOptions::new("https://example.com").user_agent("");
        assert_eq!(opts.user_agent.as_deref(), Some(""));
    }

    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler  /2.0"));
    }

    // --- Page extraction helpers on hand-built pages ---

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    // --- URL validation failure paths (reject before reaching the engine) ---

    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
}