// servo_fetch/engine.rs
//! Servo browser engine facade.

use std::time::Duration;

use crate::error::Error;
/// Rendered page returned by [`fetch`].
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// Fully rendered HTML after JavaScript execution.
    pub html: String,
    /// Plain text content (`document.body.innerText`).
    pub inner_text: String,
    /// Page title extracted from `<title>` tag.
    pub title: Option<String>,
    /// Parsed layout data from the injected CSS heuristics script.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Result of JavaScript evaluation, if [`FetchOptions::javascript`] was used.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Browser console messages captured during page load.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree (AccessKit), if requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    // PNG-encoded screenshot bytes (encoded in `from_servo`); kept private and
    // exposed read-only via `screenshot_png()`. Skipped during serialization.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}

32impl Page {
33    /// Extract readable Markdown from this page.
34    pub fn markdown(&self) -> crate::error::Result<String> {
35        self.markdown_with_url("")
36    }
37
38    /// Extract readable Markdown, using the original URL for link resolution.
39    pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
40        let input = crate::extract::ExtractInput::new(&self.html, url)
41            .with_layout_json(self.layout_json.as_deref())
42            .with_inner_text(Some(&self.inner_text));
43        Ok(crate::extract::extract_text(&input)?)
44    }
45
46    /// Extract structured JSON from this page.
47    pub fn extract_json(&self) -> crate::error::Result<String> {
48        self.extract_json_with_url("")
49    }
50
51    /// Extract structured JSON, using the original URL for link resolution.
52    pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
53        let input = crate::extract::ExtractInput::new(&self.html, url)
54            .with_layout_json(self.layout_json.as_deref())
55            .with_inner_text(Some(&self.inner_text));
56        Ok(crate::extract::extract_json(&input)?)
57    }
58
59    /// Extract readable Markdown from the subtree matched by a CSS selector.
60    pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
61        let input = crate::extract::ExtractInput::new(&self.html, url)
62            .with_layout_json(self.layout_json.as_deref())
63            .with_inner_text(Some(&self.inner_text))
64            .with_selector(Some(selector));
65        Ok(crate::extract::extract_text(&input)?)
66    }
67
68    /// Extract structured JSON from the subtree matched by a CSS selector.
69    pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
70        let input = crate::extract::ExtractInput::new(&self.html, url)
71            .with_layout_json(self.layout_json.as_deref())
72            .with_inner_text(Some(&self.inner_text))
73            .with_selector(Some(selector));
74        Ok(crate::extract::extract_json(&input)?)
75    }
76
77    /// PNG screenshot bytes, if captured via [`FetchOptions::screenshot`].
78    #[must_use]
79    pub fn screenshot_png(&self) -> Option<&[u8]> {
80        self.screenshot_png.as_deref()
81    }
82
83    pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
84        let title = {
85            let doc = dom_query::Document::from(page.html.as_str());
86            let t = doc.select("title").text().to_string();
87            if t.is_empty() { None } else { Some(t) }
88        };
89        let screenshot_png = page.screenshot.and_then(|img| {
90            let mut buf = std::io::Cursor::new(Vec::new());
91            img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
92            Some(buf.into_inner())
93        });
94        Self {
95            html: page.html,
96            inner_text: page.inner_text.unwrap_or_default(),
97            title,
98            layout_json: page.layout_json,
99            js_result: page.js_result,
100            console_messages: page
101                .console_messages
102                .into_iter()
103                .map(|m| ConsoleMessage {
104                    level: match m.level {
105                        crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
106                        crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
107                        crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
108                        crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
109                        crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
110                        crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
111                    },
112                    message: m.message,
113                })
114                .collect(),
115            screenshot_png,
116            accessibility_tree: page.accessibility_tree,
117        }
118    }
119}
120
/// Browser console message captured during page load.
///
/// Constructed by `Page::from_servo` from the bridge-level messages.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity level.
    pub level: ConsoleLevel,
    /// Message text.
    pub message: String,
}

/// Console message severity.
///
/// Serializes as the lowercase variant name (`"log"`, `"warn"`, …), matching
/// [`ConsoleLevel::as_str`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    /// General log message.
    Log,
    /// Debug-level message.
    Debug,
    /// Informational message.
    Info,
    /// Warning message.
    Warn,
    /// Error message.
    Error,
    /// Trace-level message.
    Trace,
}

150impl ConsoleLevel {
151    /// Returns the string representation of this level.
152    #[must_use]
153    pub fn as_str(&self) -> &'static str {
154        match self {
155            Self::Log => "log",
156            Self::Debug => "debug",
157            Self::Info => "info",
158            Self::Warn => "warn",
159            Self::Error => "error",
160            Self::Trace => "trace",
161        }
162    }
163}
164
165impl std::fmt::Display for ConsoleLevel {
166    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
167        f.pad(self.as_str())
168    }
169}
170
/// Internal fetch mode, selected by the [`FetchOptions`] constructors.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Fetch rendered content only (the default; see [`FetchOptions::new`]).
    #[default]
    Content,
    /// Capture a PNG screenshot (see [`FetchOptions::screenshot`]).
    Screenshot {
        // Presumably full document vs. viewport only — semantics are defined
        // by the bridge; confirm against crate::bridge::FetchMode::Screenshot.
        full_page: bool,
    },
    /// Evaluate the contained JavaScript expression after page load
    /// (see [`FetchOptions::javascript`]).
    JavaScript(String),
}

/// Options for a single page fetch.
///
/// # Thread Safety
///
/// [`fetch`] is safe to call from multiple threads. Each call queues a request
/// to the shared Servo engine thread, which processes them sequentially.
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    /// Target URL; validated by [`fetch`] before use.
    pub(crate) url: String,
    /// Page load timeout (default: 30s).
    pub(crate) timeout: Duration,
    /// Extra wait after the load event (default: zero).
    pub(crate) settle: Duration,
    /// What to do once the page has loaded (content / screenshot / JS).
    pub(crate) mode: FetchMode,
    /// Optional User-Agent override, already sanitized via `sanitize_user_agent`.
    pub(crate) user_agent: Option<String>,
}

197impl FetchOptions {
198    /// Fetch rendered content (default mode).
199    pub fn new(url: &str) -> Self {
200        Self {
201            url: url.into(),
202            timeout: Duration::from_secs(30),
203            settle: Duration::ZERO,
204            mode: FetchMode::Content,
205            user_agent: None,
206        }
207    }
208
209    /// Capture a PNG screenshot.
210    pub fn screenshot(url: &str, full_page: bool) -> Self {
211        Self {
212            mode: FetchMode::Screenshot { full_page },
213            ..Self::new(url)
214        }
215    }
216
217    /// Execute a JavaScript expression and return the result.
218    pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
219        Self {
220            mode: FetchMode::JavaScript(expression.into()),
221            ..Self::new(url)
222        }
223    }
224
225    /// Page load timeout (default: 30s).
226    pub fn timeout(mut self, timeout: Duration) -> Self {
227        self.timeout = timeout;
228        self
229    }
230
231    /// Extra wait after load event for SPA hydration (default: 0).
232    pub fn settle(mut self, settle: Duration) -> Self {
233        self.settle = settle;
234        self
235    }
236
237    /// Override the User-Agent string for this request.
238    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
239        self.user_agent = Some(sanitize_user_agent(ua.into()));
240        self
241    }
242}
243
244/// Fetch a single page via the embedded Servo engine.
245///
246/// The first call spawns a persistent engine thread that lives for the process
247/// lifetime. If the engine thread panics, this returns [`Error::Engine`].
248#[allow(clippy::needless_pass_by_value)]
249pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
250    ensure_crypto_provider();
251
252    crate::net::validate_url(&opts.url).map_err(|e| map_url_error(&opts.url, e))?;
253
254    if matches!(opts.mode, FetchMode::Content)
255        && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
256    {
257        let text = crate::extract::extract_pdf(&bytes);
258        return Ok(Page {
259            html: String::new(),
260            inner_text: text,
261            ..Page::default()
262        });
263    }
264
265    let bridge_opts = crate::bridge::FetchOptions {
266        url: &opts.url,
267        timeout_secs: opts.timeout.as_secs().max(1),
268        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
269        user_agent: opts.user_agent.as_deref(),
270        mode: match opts.mode {
271            FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
272            FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
273            FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
274                expression: expr.clone(),
275            },
276        },
277    };
278
279    let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
280        let msg = format!("{e:#}");
281        if msg.contains("timed out") {
282            Error::Timeout {
283                url: opts.url.clone(),
284                timeout: opts.timeout,
285            }
286        } else {
287            Error::Engine(msg)
288        }
289    })?;
290
291    Ok(Page::from_servo(servo_page))
292}
293
/// Options for crawling a site.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    /// Seed URL; validated by `build_crawl_options` before the crawl starts.
    pub(crate) url: String,
    /// Maximum number of pages to crawl (default: 50).
    pub(crate) limit: usize,
    /// Maximum link depth from the seed URL (default: 3).
    pub(crate) max_depth: usize,
    /// Per-page load timeout (default: 30s).
    pub(crate) timeout: Duration,
    /// Extra wait after the load event, per page (default: zero).
    pub(crate) settle: Duration,
    /// URL path glob patterns to include; empty means "include everything".
    pub(crate) include: Vec<String>,
    /// URL path glob patterns to exclude; empty means "exclude nothing".
    pub(crate) exclude: Vec<String>,
    /// Optional CSS selector to scope extraction per page.
    pub(crate) selector: Option<String>,
    /// Emit JSON instead of Markdown for each page's content.
    pub(crate) json: bool,
    /// Optional User-Agent override, already sanitized via `sanitize_user_agent`.
    pub(crate) user_agent: Option<String>,
}

310impl CrawlOptions {
311    /// Create crawl options for the given seed URL.
312    pub fn new(url: &str) -> Self {
313        Self {
314            url: url.into(),
315            limit: 50,
316            max_depth: 3,
317            timeout: Duration::from_secs(30),
318            settle: Duration::ZERO,
319            include: Vec::new(),
320            exclude: Vec::new(),
321            selector: None,
322            json: false,
323            user_agent: None,
324        }
325    }
326
327    /// Maximum number of pages to crawl (default: 50).
328    pub fn limit(mut self, n: usize) -> Self {
329        self.limit = n;
330        self
331    }
332
333    /// Maximum link depth from the seed URL (default: 3).
334    pub fn max_depth(mut self, n: usize) -> Self {
335        self.max_depth = n;
336        self
337    }
338
339    /// Page load timeout per page (default: 30s).
340    pub fn timeout(mut self, timeout: Duration) -> Self {
341        self.timeout = timeout;
342        self
343    }
344
345    /// Extra wait after load event per page (default: 0).
346    pub fn settle(mut self, settle: Duration) -> Self {
347        self.settle = settle;
348        self
349    }
350
351    /// URL path glob patterns to include (e.g. `"/docs/**"`).
352    pub fn include(mut self, patterns: &[&str]) -> Self {
353        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
354        self
355    }
356
357    /// URL path glob patterns to exclude (e.g. `"/docs/archive/**"`).
358    pub fn exclude(mut self, patterns: &[&str]) -> Self {
359        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
360        self
361    }
362
363    /// Output crawled content as JSON instead of Markdown.
364    pub fn json(mut self, json: bool) -> Self {
365        self.json = json;
366        self
367    }
368
369    /// CSS selector to extract a specific section per page.
370    pub fn selector(mut self, selector: impl Into<String>) -> Self {
371        self.selector = Some(selector.into());
372        self
373    }
374
375    /// Override the User-Agent string for all pages in this crawl.
376    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
377        self.user_agent = Some(sanitize_user_agent(ua.into()));
378        self
379    }
380}
381
/// Result for a single crawled page.
///
/// Serializes to a flat JSON map with a `status` field of `"ok"` or `"error"`
/// (see the hand-written `Serialize` impl).
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL of the crawled page.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Page content if successful, or error if failed.
    pub outcome: Result<CrawlPage, CrawlError>,
}

/// Successfully crawled page.
// NOTE(review): unlike CrawlResult/Page, this struct is not #[non_exhaustive];
// adding a field later will be a breaking change — confirm this is intended.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Page title.
    pub title: Option<String>,
    /// Extracted content (Markdown or JSON depending on options).
    pub content: String,
    /// Number of links discovered on this page.
    pub links_found: usize,
}

/// Error from a failed crawl attempt.
///
/// Implements [`std::error::Error`]; `Display` emits the message verbatim.
#[derive(Debug, Clone)]
pub struct CrawlError {
    /// Error message.
    pub message: String,
}

412impl std::fmt::Display for CrawlError {
413    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
414        f.write_str(&self.message)
415    }
416}
417
418impl std::error::Error for CrawlError {}
419
420impl serde::Serialize for CrawlResult {
421    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
422        use serde::ser::SerializeMap;
423        match &self.outcome {
424            Ok(page) => {
425                let mut map = serializer.serialize_map(None)?;
426                map.serialize_entry("url", &self.url)?;
427                map.serialize_entry("depth", &self.depth)?;
428                map.serialize_entry("status", "ok")?;
429                if let Some(t) = &page.title {
430                    map.serialize_entry("title", t)?;
431                }
432                map.serialize_entry("content", &page.content)?;
433                map.serialize_entry("links_found", &page.links_found)?;
434                map.end()
435            }
436            Err(e) => {
437                let mut map = serializer.serialize_map(None)?;
438                map.serialize_entry("url", &self.url)?;
439                map.serialize_entry("depth", &self.depth)?;
440                map.serialize_entry("status", "error")?;
441                map.serialize_entry("error", &e.message)?;
442                map.end()
443            }
444        }
445    }
446}
447
impl CrawlResult {
    /// Convert a crate-internal crawl result into the public API type.
    ///
    /// Clones every field because the internal result is only borrowed from
    /// the crawl callback.
    fn from_internal(r: &crate::crawl::CrawlPageResult) -> Self {
        // `content` and `error` are optional on the internal type; missing
        // values map to empty strings rather than being treated as failures.
        let outcome = match r.status {
            crate::crawl::CrawlStatus::Ok => Ok(CrawlPage {
                title: r.title.clone(),
                content: r.content.clone().unwrap_or_default(),
                links_found: r.links_found,
            }),
            crate::crawl::CrawlStatus::Error => Err(CrawlError {
                message: r.error.clone().unwrap_or_default(),
            }),
        };
        Self {
            url: r.url.clone(),
            depth: r.depth,
            outcome,
        }
    }
}

468/// Crawl a site, invoking `on_page` for each result as it arrives.
469#[allow(clippy::needless_pass_by_value)]
470pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
471    ensure_crypto_provider();
472    let internal_opts = build_crawl_options(&opts)?;
473    crate::runtime::block_on(crate::crawl::run(internal_opts, |r| {
474        on_page(&CrawlResult::from_internal(r));
475    }))
476    .map_err(|e| Error::Engine(e.to_string()))?;
477    Ok(())
478}
479
480/// Crawl a site and collect all results.
481#[allow(clippy::needless_pass_by_value)]
482pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
483    let mut results = Vec::new();
484    crawl_each(opts, |r| results.push(r.clone()))?;
485    Ok(results)
486}
487
488/// Fetch a URL and return readable Markdown.
489pub fn markdown(url: &str) -> crate::error::Result<String> {
490    fetch(FetchOptions::new(url))?.markdown_with_url(url)
491}
492
493/// Fetch a URL and return structured JSON.
494pub fn extract_json(url: &str) -> crate::error::Result<String> {
495    fetch(FetchOptions::new(url))?.extract_json_with_url(url)
496}
497
498/// Fetch a URL and return plain text (`document.body.innerText`).
499pub fn text(url: &str) -> crate::error::Result<String> {
500    Ok(fetch(FetchOptions::new(url))?.inner_text)
501}
502
503/// Validate a URL for fetching. Rejects disallowed schemes and private addresses.
504pub fn validate_url(url: &str) -> crate::error::Result<url::Url> {
505    crate::net::validate_url(url).map_err(|e| map_url_error(url, e))
506}
507
/// Install rustls's process-wide default crypto provider (aws-lc-rs).
///
/// `install_default` returns `Err` when a provider has already been installed;
/// since every `fetch`/`crawl_each` call passes through here, the result is
/// deliberately ignored to keep the call idempotent.
fn ensure_crypto_provider() {
    let _ = rustls::crypto::aws_lc_rs::default_provider().install_default();
}

/// Replace CR, LF, and NUL with SP per RFC 9110.
///
/// Allocation-free when the input is already clean (the common case): the
/// string is only rebuilt if a forbidden character is present.
pub(crate) fn sanitize_user_agent(ua: String) -> String {
    let needs_fix = ua.chars().any(|c| matches!(c, '\r' | '\n' | '\0'));
    if needs_fix {
        ua.replace(['\r', '\n', '\0'], " ")
    } else {
        ua
    }
}

521fn map_url_error(url: &str, e: crate::net::UrlError) -> Error {
522    match e {
523        crate::net::UrlError::PrivateAddress(host) => Error::AddressNotAllowed(host),
524        crate::net::UrlError::Invalid(reason) => Error::InvalidUrl {
525            url: url.into(),
526            reason,
527        },
528    }
529}
530
531fn build_crawl_options(opts: &CrawlOptions) -> crate::error::Result<crate::crawl::CrawlOptions> {
532    let seed = crate::net::validate_url(&opts.url).map_err(|e| map_url_error(&opts.url, e))?;
533    let include = if opts.include.is_empty() {
534        None
535    } else {
536        Some(crate::crawl::build_globset(&opts.include).map_err(|e| Error::Engine(e.to_string()))?)
537    };
538    let exclude = if opts.exclude.is_empty() {
539        None
540    } else {
541        Some(crate::crawl::build_globset(&opts.exclude).map_err(|e| Error::Engine(e.to_string()))?)
542    };
543    Ok(crate::crawl::CrawlOptions {
544        seed,
545        limit: opts.limit,
546        max_depth: opts.max_depth,
547        timeout_secs: opts.timeout.as_secs().max(1),
548        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
549        include,
550        exclude,
551        selector: opts.selector.clone(),
552        json: opts.json,
553        user_agent: opts.user_agent.clone(),
554    })
555}
556
557#[cfg(test)]
558mod tests {
559    use super::*;
560
    // --- FetchOptions: construction and builder behavior ---

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    // --- CrawlOptions: defaults and builder chaining ---

    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
    }

    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"]);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
    }

    // --- User-Agent sanitization (header-injection defense) ---

    #[test]
    fn fetch_user_agent_set() {
        let opts = FetchOptions::new("https://example.com").user_agent("MyBot/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    #[test]
    fn fetch_user_agent_default_is_none() {
        let opts = FetchOptions::new("https://example.com");
        assert!(opts.user_agent.is_none());
    }

    // CR/LF must not survive into the stored UA, or a caller-supplied value
    // could smuggle extra header lines.
    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\r\nX-Evil: yes");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot  X-Evil: yes"));
    }

    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\0/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot /1.0"));
    }

    // Empty string is stored as Some("") — it is not normalized to None.
    #[test]
    fn fetch_user_agent_empty_string() {
        let opts = FetchOptions::new("https://example.com").user_agent("");
        assert_eq!(opts.user_agent.as_deref(), Some(""));
    }

    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler  /2.0"));
    }

    // --- Page extraction: Markdown / JSON, with and without selectors ---

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    // A selector scopes extraction to the matched subtree only.
    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = Page {
            html: "<html><body><article>keep</article><aside>drop</aside></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(md.contains("keep"));
        assert!(!md.contains("drop"));
    }

    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = Page {
            html: "<html><body><article>scoped</article></body></html>".into(),
            ..Page::default()
        };
        let json = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
        assert_eq!(parsed["url"].as_str(), Some("https://example.com/page"));
        assert!(parsed["text_content"].as_str().unwrap().contains("scoped"));
    }

    // No match is Ok("") — not an error.
    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = Page {
            html: "<html><body><article>x</article></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(md.is_empty());
    }

    // A syntactically invalid selector is an error, unlike a non-matching one.
    #[test]
    fn page_markdown_with_invalid_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        let err = page.markdown_with_selector("", "###invalid[[[").unwrap_err();
        assert!(err.to_string().contains("invalid CSS selector"));
    }

    #[test]
    fn page_markdown_with_empty_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        assert!(page.markdown_with_selector("", "").is_err());
    }

    // --- URL validation: rejected before any engine work happens ---

    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
753}