// servo_fetch/engine.rs
//! Servo browser engine facade.

use std::time::Duration;

use crate::error::Error;

/// Rendered page returned by [`fetch`].
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// Fully rendered HTML after JavaScript execution.
    pub html: String,
    /// Plain text content (`document.body.innerText`).
    pub inner_text: String,
    /// Page title extracted from `<title>` tag.
    pub title: Option<String>,
    /// Parsed layout data from the injected CSS heuristics script.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Result of JavaScript evaluation, if [`FetchOptions::javascript`] was used.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Browser console messages captured during page load.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree (AccessKit), if requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    // Raw PNG bytes; kept private (read via `screenshot_png()`) and excluded
    // from serialization so JSON output stays small.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}
31
32impl Page {
33    /// Extract readable Markdown from this page.
34    pub fn markdown(&self) -> crate::error::Result<String> {
35        self.markdown_with_url("")
36    }
37
38    /// Extract readable Markdown, using the original URL for link resolution.
39    pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
40        let input = crate::extract::ExtractInput::new(&self.html, url)
41            .with_layout_json(self.layout_json.as_deref())
42            .with_inner_text(Some(&self.inner_text));
43        Ok(crate::extract::extract_text(&input)?)
44    }
45
46    /// Extract structured JSON from this page.
47    pub fn extract_json(&self) -> crate::error::Result<String> {
48        self.extract_json_with_url("")
49    }
50
51    /// Extract structured JSON, using the original URL for link resolution.
52    pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
53        let input = crate::extract::ExtractInput::new(&self.html, url)
54            .with_layout_json(self.layout_json.as_deref())
55            .with_inner_text(Some(&self.inner_text));
56        Ok(crate::extract::extract_json(&input)?)
57    }
58
59    /// Extract readable Markdown from the subtree matched by a CSS selector.
60    pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
61        let input = crate::extract::ExtractInput::new(&self.html, url)
62            .with_layout_json(self.layout_json.as_deref())
63            .with_inner_text(Some(&self.inner_text))
64            .with_selector(Some(selector));
65        Ok(crate::extract::extract_text(&input)?)
66    }
67
68    /// Extract structured JSON from the subtree matched by a CSS selector.
69    pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
70        let input = crate::extract::ExtractInput::new(&self.html, url)
71            .with_layout_json(self.layout_json.as_deref())
72            .with_inner_text(Some(&self.inner_text))
73            .with_selector(Some(selector));
74        Ok(crate::extract::extract_json(&input)?)
75    }
76
77    /// PNG screenshot bytes, if captured via [`FetchOptions::screenshot`].
78    #[must_use]
79    pub fn screenshot_png(&self) -> Option<&[u8]> {
80        self.screenshot_png.as_deref()
81    }
82
83    pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
84        let title = {
85            let doc = dom_query::Document::from(page.html.as_str());
86            let t = doc.select("title").text().to_string();
87            if t.is_empty() { None } else { Some(t) }
88        };
89        let screenshot_png = page.screenshot.and_then(|img| {
90            let mut buf = std::io::Cursor::new(Vec::new());
91            img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
92            Some(buf.into_inner())
93        });
94        Self {
95            html: page.html,
96            inner_text: page.inner_text.unwrap_or_default(),
97            title,
98            layout_json: page.layout_json,
99            js_result: page.js_result,
100            console_messages: page
101                .console_messages
102                .into_iter()
103                .map(|m| ConsoleMessage {
104                    level: match m.level {
105                        crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
106                        crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
107                        crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
108                        crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
109                        crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
110                        crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
111                    },
112                    message: m.message,
113                })
114                .collect(),
115            screenshot_png,
116            accessibility_tree: page.accessibility_tree,
117        }
118    }
119}
120
/// Browser console message captured during page load.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity level.
    pub level: ConsoleLevel,
    /// Message text.
    pub message: String,
}
130
/// Console message severity.
///
/// Serialized in lowercase (matching [`ConsoleLevel::as_str`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    /// General log message.
    Log,
    /// Debug-level message.
    Debug,
    /// Informational message.
    Info,
    /// Warning message.
    Warn,
    /// Error message.
    Error,
    /// Trace-level message.
    Trace,
}
149
150impl ConsoleLevel {
151    /// Returns the string representation of this level.
152    #[must_use]
153    pub fn as_str(&self) -> &'static str {
154        match self {
155            Self::Log => "log",
156            Self::Debug => "debug",
157            Self::Info => "info",
158            Self::Warn => "warn",
159            Self::Error => "error",
160            Self::Trace => "trace",
161        }
162    }
163}
164
impl std::fmt::Display for ConsoleLevel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `pad` (rather than `write_str`) honors width/fill/alignment flags
        // such as `{:>5}`.
        f.pad(self.as_str())
    }
}
170
/// Internal fetch mode selected by the [`FetchOptions`] constructors.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Render the page and return its content (default).
    #[default]
    Content,
    /// Capture a PNG screenshot of the viewport or the full page.
    Screenshot {
        full_page: bool,
    },
    /// Evaluate the given JavaScript expression after load.
    JavaScript(String),
}
180
/// Options for a single page fetch.
///
/// # Thread Safety
///
/// [`fetch`] is safe to call from multiple threads. Each call queues a request
/// to the shared Servo engine thread, which processes them sequentially.
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    pub(crate) url: String,
    // Page load timeout (default 30s; clamped to >= 1s when lowered to secs).
    pub(crate) timeout: Duration,
    // Extra post-load wait for SPA hydration (default zero).
    pub(crate) settle: Duration,
    pub(crate) mode: FetchMode,
    // Sanitized via `sanitize_user_agent`; `None` uses the engine default.
    pub(crate) user_agent: Option<String>,
}
196
197impl FetchOptions {
198    /// Fetch rendered content (default mode).
199    pub fn new(url: &str) -> Self {
200        Self {
201            url: url.into(),
202            timeout: Duration::from_secs(30),
203            settle: Duration::ZERO,
204            mode: FetchMode::Content,
205            user_agent: None,
206        }
207    }
208
209    /// Capture a PNG screenshot.
210    pub fn screenshot(url: &str, full_page: bool) -> Self {
211        Self {
212            mode: FetchMode::Screenshot { full_page },
213            ..Self::new(url)
214        }
215    }
216
217    /// Execute a JavaScript expression and return the result.
218    pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
219        Self {
220            mode: FetchMode::JavaScript(expression.into()),
221            ..Self::new(url)
222        }
223    }
224
225    /// Page load timeout (default: 30s).
226    pub fn timeout(mut self, timeout: Duration) -> Self {
227        self.timeout = timeout;
228        self
229    }
230
231    /// Extra wait after load event for SPA hydration (default: 0).
232    pub fn settle(mut self, settle: Duration) -> Self {
233        self.settle = settle;
234        self
235    }
236
237    /// Override the User-Agent string for this request.
238    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
239        self.user_agent = Some(sanitize_user_agent(ua.into()));
240        self
241    }
242}
243
/// Fetch a single page via the embedded Servo engine.
///
/// The first call spawns a persistent engine thread that lives for the process
/// lifetime. If the engine thread panics, this returns [`Error::Engine`].
#[allow(clippy::needless_pass_by_value)]
pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
    ensure_crypto_provider();

    // Reject disallowed schemes / private addresses before touching the engine.
    crate::net::validate_url(&opts.url, crate::bridge::engine_policy()).map_err(|e| map_url_error(&opts.url, e))?;

    // PDF short-circuit: in content mode, if the URL serves a PDF, extract its
    // text directly and skip the browser entirely (`html` stays empty).
    if matches!(opts.mode, FetchMode::Content)
        && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
    {
        let text = crate::extract::extract_pdf(&bytes);
        return Ok(Page {
            html: String::new(),
            inner_text: text,
            ..Page::default()
        });
    }

    // Lower the public options into the bridge-layer request. The timeout is
    // clamped to at least 1s; settle saturates at u64::MAX milliseconds.
    let bridge_opts = crate::bridge::FetchOptions {
        url: &opts.url,
        timeout_secs: opts.timeout.as_secs().max(1),
        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
        user_agent: opts.user_agent.as_deref(),
        mode: match opts.mode {
            FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
            FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
            FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
                expression: expr.clone(),
            },
        },
    };

    let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
        // NOTE(review): timeout detection relies on substring matching of the
        // bridge error text; a typed error from the bridge would be sturdier.
        let msg = format!("{e:#}");
        if msg.contains("timed out") {
            Error::Timeout {
                url: opts.url.clone(),
                timeout: opts.timeout,
            }
        } else {
            Error::Engine(msg)
        }
    })?;

    Ok(Page::from_servo(servo_page))
}
293
/// Options for crawling a site.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    pub(crate) url: String,
    // Maximum number of pages (default 50).
    pub(crate) limit: usize,
    // Maximum link depth from the seed (default 3).
    pub(crate) max_depth: usize,
    pub(crate) timeout: Duration,
    pub(crate) settle: Duration,
    // Path glob patterns; an empty list means "no filter".
    pub(crate) include: Vec<String>,
    pub(crate) exclude: Vec<String>,
    // Optional CSS selector to scope extraction per page.
    pub(crate) selector: Option<String>,
    // Emit JSON per page instead of Markdown.
    pub(crate) json: bool,
    // Sanitized via `sanitize_user_agent`; `None` uses the engine default.
    pub(crate) user_agent: Option<String>,
}
309
310impl CrawlOptions {
311    /// Create crawl options for the given seed URL.
312    pub fn new(url: &str) -> Self {
313        Self {
314            url: url.into(),
315            limit: 50,
316            max_depth: 3,
317            timeout: Duration::from_secs(30),
318            settle: Duration::ZERO,
319            include: Vec::new(),
320            exclude: Vec::new(),
321            selector: None,
322            json: false,
323            user_agent: None,
324        }
325    }
326
327    /// Maximum number of pages to crawl (default: 50).
328    pub fn limit(mut self, n: usize) -> Self {
329        self.limit = n;
330        self
331    }
332
333    /// Maximum link depth from the seed URL (default: 3).
334    pub fn max_depth(mut self, n: usize) -> Self {
335        self.max_depth = n;
336        self
337    }
338
339    /// Page load timeout per page (default: 30s).
340    pub fn timeout(mut self, timeout: Duration) -> Self {
341        self.timeout = timeout;
342        self
343    }
344
345    /// Extra wait after load event per page (default: 0).
346    pub fn settle(mut self, settle: Duration) -> Self {
347        self.settle = settle;
348        self
349    }
350
351    /// URL path glob patterns to include (e.g. `"/docs/**"`).
352    pub fn include(mut self, patterns: &[&str]) -> Self {
353        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
354        self
355    }
356
357    /// URL path glob patterns to exclude (e.g. `"/docs/archive/**"`).
358    pub fn exclude(mut self, patterns: &[&str]) -> Self {
359        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
360        self
361    }
362
363    /// Output crawled content as JSON instead of Markdown.
364    pub fn json(mut self, json: bool) -> Self {
365        self.json = json;
366        self
367    }
368
369    /// CSS selector to extract a specific section per page.
370    pub fn selector(mut self, selector: impl Into<String>) -> Self {
371        self.selector = Some(selector.into());
372        self
373    }
374
375    /// Override the User-Agent string for all pages in this crawl.
376    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
377        self.user_agent = Some(sanitize_user_agent(ua.into()));
378        self
379    }
380}
381
/// Result for a single crawled page.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL of the crawled page.
    pub url: String,
    /// Link depth from the seed URL.
    pub depth: usize,
    /// Page content if successful, or error if failed.
    pub outcome: Result<CrawlPage, CrawlError>,
}
393
/// Successfully crawled page.
// NOTE(review): unlike `CrawlResult`, this is not `#[non_exhaustive]`; adding
// the attribute now would break downstream constructors, so it stays as-is.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Page title.
    pub title: Option<String>,
    /// Extracted content (Markdown or JSON depending on options).
    pub content: String,
    /// Number of links discovered on this page.
    pub links_found: usize,
}
404
/// Error from a failed crawl attempt.
#[derive(Debug, Clone)]
pub struct CrawlError {
    /// Error message.
    pub message: String,
}

impl std::fmt::Display for CrawlError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.message)
    }
}

// Marker impl so `CrawlError` can be boxed/propagated as a standard error.
impl std::error::Error for CrawlError {}
419
420impl serde::Serialize for CrawlResult {
421    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
422        use serde::ser::SerializeMap;
423        match &self.outcome {
424            Ok(page) => {
425                let mut map = serializer.serialize_map(None)?;
426                map.serialize_entry("url", &self.url)?;
427                map.serialize_entry("depth", &self.depth)?;
428                map.serialize_entry("status", "ok")?;
429                if let Some(t) = &page.title {
430                    map.serialize_entry("title", t)?;
431                }
432                map.serialize_entry("content", &page.content)?;
433                map.serialize_entry("links_found", &page.links_found)?;
434                map.end()
435            }
436            Err(e) => {
437                let mut map = serializer.serialize_map(None)?;
438                map.serialize_entry("url", &self.url)?;
439                map.serialize_entry("depth", &self.depth)?;
440                map.serialize_entry("status", "error")?;
441                map.serialize_entry("error", &e.message)?;
442                map.end()
443            }
444        }
445    }
446}
447
impl CrawlResult {
    /// Convert the crate-internal crawl record into the public result type.
    fn from_internal(r: &crate::crawl::CrawlPageResult) -> Self {
        let outcome = match r.status {
            crate::crawl::CrawlStatus::Ok => Ok(CrawlPage {
                title: r.title.clone(),
                // Missing content on an Ok page is treated as empty output.
                content: r.content.clone().unwrap_or_default(),
                links_found: r.links_found,
            }),
            crate::crawl::CrawlStatus::Error => Err(CrawlError {
                // Missing error text degrades to an empty message.
                message: r.error.clone().unwrap_or_default(),
            }),
        };
        Self {
            url: r.url.clone(),
            depth: r.depth,
            outcome,
        }
    }
}
467
/// Crawl a site, invoking `on_page` for each result as it arrives.
#[allow(clippy::needless_pass_by_value)]
pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
    ensure_crypto_provider();
    // Validates the seed URL and lowers the options to the internal form.
    let internal_opts = build_crawl_options(&opts)?;
    crate::runtime::block_on(async {
        // robots.txt is fetched via blocking I/O, so run it off the async
        // runtime. If the blocking task fails (e.g. panics), fall back to
        // treating robots.txt as unreachable instead of aborting the crawl.
        let robots = tokio::task::spawn_blocking({
            let seed = internal_opts.seed.clone();
            let user_agent = internal_opts.user_agent.clone();
            let timeout = Duration::from_secs(internal_opts.timeout_secs);
            move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
        })
        .await
        .unwrap_or(crate::robots::RobotsPolicy::Unreachable);
        // Stream results to the caller as each page completes.
        crate::crawl::run(internal_opts, robots, &crate::bridge::ServoFetcher, |r| {
            on_page(&CrawlResult::from_internal(r));
        })
        .await
    })
    .map_err(|e| Error::Engine(e.to_string()))?;
    Ok(())
}
490
491/// Crawl a site and collect all results.
492#[allow(clippy::needless_pass_by_value)]
493pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
494    let mut results = Vec::new();
495    crawl_each(opts, |r| results.push(r.clone()))?;
496    Ok(results)
497}
498
/// Options for URL discovery (sitemap + link extraction, no rendering).
#[must_use = "options do nothing until passed to map()"]
#[derive(Debug, Clone)]
pub struct MapOptions {
    url: String,
    // Maximum number of URLs to discover (default 5000).
    limit: usize,
    // Path glob patterns; an empty list means "no filter".
    include: Vec<String>,
    exclude: Vec<String>,
    user_agent: Option<String>,
    // Per-request timeout in whole seconds (default 30).
    timeout: u64,
    // Skip the HTML-link fallback when no sitemap exists.
    no_fallback: bool,
}
511
512impl MapOptions {
513    /// Create map options for the given URL.
514    pub fn new(url: impl Into<String>) -> Self {
515        Self {
516            url: url.into(),
517            limit: 5000,
518            include: Vec::new(),
519            exclude: Vec::new(),
520            user_agent: None,
521            timeout: 30,
522            no_fallback: false,
523        }
524    }
525
526    /// Maximum number of URLs to discover.
527    pub fn limit(mut self, n: usize) -> Self {
528        self.limit = n;
529        self
530    }
531
532    /// URL path glob patterns to include.
533    pub fn include(mut self, patterns: &[&str]) -> Self {
534        self.include = patterns.iter().map(|s| (*s).to_string()).collect();
535        self
536    }
537
538    /// URL path glob patterns to exclude.
539    pub fn exclude(mut self, patterns: &[&str]) -> Self {
540        self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
541        self
542    }
543
544    /// Override the User-Agent string.
545    pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
546        self.user_agent = Some(ua.into());
547        self
548    }
549
550    /// Timeout in seconds per HTTP request.
551    pub fn timeout(mut self, secs: u64) -> Self {
552        self.timeout = secs;
553        self
554    }
555
556    /// Skip HTML link fallback if no sitemap is found.
557    pub fn no_fallback(mut self, yes: bool) -> Self {
558        self.no_fallback = yes;
559        self
560    }
561}
562
/// A discovered URL from sitemap or link extraction.
#[derive(Debug, Clone, serde::Serialize)]
pub struct MappedUrl {
    /// The discovered URL.
    pub url: String,
    /// Last modification date from sitemap, if available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
572
/// Discover URLs on a site via sitemaps and link extraction (no rendering).
#[allow(clippy::needless_pass_by_value)]
pub fn map(opts: MapOptions) -> crate::error::Result<Vec<MappedUrl>> {
    ensure_crypto_provider();
    // Parse first for a precise error message, then apply the network policy
    // (scheme / private-address checks) to the normalized URL.
    let seed = url::Url::parse(&opts.url).map_err(|e| Error::InvalidUrl {
        url: opts.url.clone(),
        reason: e.to_string(),
    })?;
    crate::net::validate_url(seed.as_str(), crate::bridge::engine_policy()).map_err(|e| map_url_error(&opts.url, e))?;

    // Empty pattern lists mean "no filtering" rather than "match nothing".
    let include = if opts.include.is_empty() {
        None
    } else {
        Some(crate::scope::build_globset(&opts.include)?)
    };
    let exclude = if opts.exclude.is_empty() {
        None
    } else {
        Some(crate::scope::build_globset(&opts.exclude)?)
    };

    let internal = crate::map::MapConfig {
        seed,
        limit: opts.limit,
        include,
        exclude,
        user_agent: opts.user_agent,
        timeout: Duration::from_secs(opts.timeout),
        no_fallback: opts.no_fallback,
    };

    // Collect discovered URLs as the async mapper reports them.
    let mut results = Vec::new();
    crate::runtime::block_on(crate::map::run(&internal, |entry| {
        results.push(MappedUrl {
            url: entry.url.clone(),
            lastmod: entry.lastmod.clone(),
        });
    }))
    .map_err(|e| Error::Engine(e.to_string()))?;
    Ok(results)
}
614
615/// Fetch a URL and return readable Markdown.
616pub fn markdown(url: &str) -> crate::error::Result<String> {
617    fetch(FetchOptions::new(url))?.markdown_with_url(url)
618}
619
620/// Fetch a URL and return structured JSON.
621pub fn extract_json(url: &str) -> crate::error::Result<String> {
622    fetch(FetchOptions::new(url))?.extract_json_with_url(url)
623}
624
625/// Fetch a URL and return plain text (`document.body.innerText`).
626pub fn text(url: &str) -> crate::error::Result<String> {
627    Ok(fetch(FetchOptions::new(url))?.inner_text)
628}
629
/// Set the network policy. Must be called at most once, before any engine use.
pub fn init(policy: crate::net::NetworkPolicy) {
    crate::bridge::set_engine_policy(policy);
}
634
/// Validate a URL for fetching. Rejects disallowed schemes and private addresses
/// based on the policy set via [`init`].
pub fn validate_url(url: &str) -> crate::error::Result<url::Url> {
    crate::net::validate_url(url, crate::bridge::engine_policy()).map_err(|e| map_url_error(url, e))
}
640
/// Install the process-wide rustls crypto provider (aws-lc-rs).
///
/// The result is deliberately ignored: a provider may already be installed
/// (e.g. by an earlier call), which is the expected steady state.
fn ensure_crypto_provider() {
    let _ = rustls::crypto::aws_lc_rs::default_provider().install_default();
}
644
/// Replace CR, LF, and NUL with SP per RFC 9110.
pub(crate) fn sanitize_user_agent(ua: String) -> String {
    // Header-unsafe characters that must never reach the wire.
    const FORBIDDEN: &[char] = &['\r', '\n', '\0'];
    // Fast path: the common clean case returns the input without allocating.
    if ua.contains(FORBIDDEN) {
        ua.replace(FORBIDDEN, " ")
    } else {
        ua
    }
}
653
/// Translate a low-level URL validation error into the public [`Error`] type.
fn map_url_error(url: &str, e: crate::net::UrlError) -> Error {
    match e {
        crate::net::UrlError::PrivateAddress(host) => Error::AddressNotAllowed(host),
        crate::net::UrlError::Invalid(reason) => Error::InvalidUrl {
            url: url.into(),
            reason,
        },
    }
}
663
/// Validate the seed URL and lower the public [`CrawlOptions`] into the
/// internal crawl configuration.
fn build_crawl_options(opts: &CrawlOptions) -> crate::error::Result<crate::crawl::CrawlOptions> {
    let seed =
        crate::net::validate_url(&opts.url, crate::bridge::engine_policy()).map_err(|e| map_url_error(&opts.url, e))?;
    // Empty pattern lists mean "no filtering" rather than "match nothing".
    let include = if opts.include.is_empty() {
        None
    } else {
        Some(crate::scope::build_globset(&opts.include)?)
    };
    let exclude = if opts.exclude.is_empty() {
        None
    } else {
        Some(crate::scope::build_globset(&opts.exclude)?)
    };
    Ok(crate::crawl::CrawlOptions {
        seed,
        limit: opts.limit,
        max_depth: opts.max_depth,
        // Timeout clamps to >= 1s; settle saturates at u64::MAX milliseconds.
        timeout_secs: opts.timeout.as_secs().max(1),
        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
        include,
        exclude,
        selector: opts.selector.clone(),
        json: opts.json,
        user_agent: opts.user_agent.clone(),
    })
}
690
#[cfg(test)]
mod tests {
    use super::*;

    // --- Builder defaults and chaining ---

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
    }

    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"]);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
    }

    // --- User-Agent sanitization (CR/LF/NUL replaced with SP) ---

    #[test]
    fn fetch_user_agent_set() {
        let opts = FetchOptions::new("https://example.com").user_agent("MyBot/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    #[test]
    fn fetch_user_agent_default_is_none() {
        let opts = FetchOptions::new("https://example.com");
        assert!(opts.user_agent.is_none());
    }

    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\r\nX-Evil: yes");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot  X-Evil: yes"));
    }

    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\0/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot /1.0"));
    }

    #[test]
    fn fetch_user_agent_empty_string() {
        let opts = FetchOptions::new("https://example.com").user_agent("");
        assert_eq!(opts.user_agent.as_deref(), Some(""));
    }

    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler  /2.0"));
    }

    // --- Page extraction (no engine required; operates on stored HTML) ---

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = Page {
            html: "<html><body><article>keep</article><aside>drop</aside></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(md.contains("keep"));
        assert!(!md.contains("drop"));
    }

    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = Page {
            html: "<html><body><article>scoped</article></body></html>".into(),
            ..Page::default()
        };
        let json = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
        assert_eq!(parsed["url"].as_str(), Some("https://example.com/page"));
        assert!(parsed["text_content"].as_str().unwrap().contains("scoped"));
    }

    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = Page {
            html: "<html><body><article>x</article></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(md.is_empty());
    }

    #[test]
    fn page_markdown_with_invalid_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        let err = page.markdown_with_selector("", "###invalid[[[").unwrap_err();
        assert!(err.to_string().contains("invalid CSS selector"));
    }

    #[test]
    fn page_markdown_with_empty_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        assert!(page.markdown_with_selector("", "").is_err());
    }

    // --- URL validation (rejected before the engine is ever started) ---

    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
}