//! CLI argument parsing for the `servo-fetch` binary.

use clap::Parser;

#[derive(Parser)]
#[command(
    name = "servo-fetch",
    version,
    about = "A browser engine in a binary — fetch, render, and extract web content.",
    after_help = "\
Examples:
  servo-fetch https://example.com              Readable Markdown (default)
  servo-fetch https://example.com --json       Structured JSON
  servo-fetch URL1 URL2 URL3                   Parallel batch fetch
  servo-fetch URL1 URL2 --json                 Parallel batch (NDJSON)
  servo-fetch https://example.com --screenshot page.png
  servo-fetch https://example.com --js \"document.title\"
  servo-fetch https://example.com -t 60        Custom timeout (seconds)
  servo-fetch https://example.com --selector article  Extract specific section
  servo-fetch mcp                              Start MCP server (stdio)"
)]
pub(crate) struct Cli {
    #[command(subcommand)]
    pub command: Option<Command>,

    /// URLs to fetch (one or more)
    #[arg(num_args = 1..)]
    pub urls: Vec<String>,

    /// Output as structured JSON (NDJSON when multiple URLs)
    #[arg(long, conflicts_with_all = ["screenshot", "js"])]
    pub json: bool,

    /// Save screenshot as PNG (single URL only)
    #[arg(long, value_name = "FILE", conflicts_with_all = ["json", "js"])]
    pub screenshot: Option<String>,

    /// Capture the full scrollable page instead of just the viewport
    #[arg(long, requires = "screenshot")]
    pub full_page: bool,

    /// Execute JavaScript and print the result (single URL only)
    #[arg(long, value_name = "EXPR", conflicts_with_all = ["json", "screenshot"])]
    pub js: Option<String>,

    /// Timeout in seconds for page load
    #[arg(short = 't', long, default_value_t = 30, value_parser = clap::value_parser!(u64).range(1..), value_name = "SECS")]
    pub timeout: u64,

    /// Extra wait in ms after the `load` event, for SPAs that keep hydrating
    #[arg(long, default_value_t = 0, value_parser = clap::value_parser!(u64).range(0..=10_000), value_name = "MS")]
    pub settle: u64,

    /// CSS selector to extract a specific section
    #[arg(long, value_name = "CSS")]
    pub selector: Option<String>,

    /// Output raw HTML or plain text instead of Readability extraction
    #[arg(long, value_name = "MODE", value_enum, conflicts_with_all = ["json", "screenshot", "js", "selector"])]
    pub raw: Option<RawMode>,
}

/// Raw output mode.
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
pub(crate) enum RawMode {
    /// Raw HTML
    Html,
    /// Plain text (document.body.innerText)
    Text,
}

/// Available subcommands.
#[derive(clap::Subcommand)]
pub(crate) enum Command {
    /// Start MCP server (stdio transport by default, or HTTP with --port)
    Mcp {
        /// Port for Streamable HTTP transport. Omit for stdio.
        #[arg(long, value_name = "PORT")]
        port: Option<u16>,
    },
    /// Crawl a website by following links (BFS). Respects robots.txt.
    Crawl {
        /// Starting URL to crawl
        url: String,

        /// Maximum number of pages to crawl
        #[arg(long, default_value_t = 50, value_name = "N")]
        limit: usize,

        /// Maximum link depth from the seed URL
        #[arg(long, default_value_t = 3, value_name = "N")]
        max_depth: usize,

        /// URL path glob patterns to include (e.g. "/docs/**")
        #[arg(long, value_name = "GLOB")]
        include: Vec<String>,

        /// URL path glob patterns to exclude (e.g. "/docs/archive/**")
        #[arg(long, value_name = "GLOB")]
        exclude: Vec<String>,

        /// Output as NDJSON
        #[arg(long)]
        json: bool,

        /// CSS selector to extract a specific section per page
        #[arg(long, value_name = "CSS")]
        selector: Option<String>,

        /// Timeout in seconds per page
        #[arg(short = 't', long, default_value_t = 30, value_parser = clap::value_parser!(u64).range(1..), value_name = "SECS")]
        timeout: u64,

        /// Extra wait in ms after the `load` event, per page
        #[arg(long, default_value_t = 0, value_parser = clap::value_parser!(u64).range(0..=10_000), value_name = "MS")]
        settle: u64,
    },
}

/// Validate and sanitize a URL for fetching. Forwards to [`crate::net::validate_url`].
pub(crate) fn validate_url(input: &str) -> anyhow::Result<url::Url> {
    crate::net::validate_url(input)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn accepts_https() {
        assert!(validate_url("https://example.com").is_ok());
    }

    #[test]
    fn accepts_http() {
        assert!(validate_url("http://example.com").is_ok());
    }

    #[test]
    fn rejects_file_scheme() {
        let err = validate_url("file:///etc/passwd").unwrap_err();
        assert!(err.to_string().contains("not allowed"));
    }

    #[test]
    fn rejects_javascript_scheme() {
        let err = validate_url("javascript:alert(1)").unwrap_err();
        assert!(err.to_string().contains("not allowed"));
    }

    #[test]
    fn strips_credentials() {
        let url = validate_url("https://user:pass@example.com").unwrap();
        assert!(url.username().is_empty());
        assert!(url.password().is_none());
    }

    #[test]
    fn rejects_invalid_url() {
        assert!(validate_url("not a url").is_err());
    }

    #[test]
    fn rejects_private_host_via_url() {
        assert!(validate_url("http://127.0.0.1/").is_err());
    }

    #[test]
    fn rejects_hex_ip() {
        // url::Url::parse normalizes 0x7f000001 → 127.0.0.1
        assert!(validate_url("http://0x7f000001/").is_err());
    }

    #[test]
    fn rejects_decimal_ip() {
        assert!(validate_url("http://2130706433/").is_err());
    }

    #[test]
    fn rejects_data_scheme() {
        assert!(
            validate_url("data:text/html,<h1>hi</h1>")
                .unwrap_err()
                .to_string()
                .contains("not allowed")
        );
    }

    #[test]
    fn raw_mode_from_str() {
        use clap::ValueEnum;
        assert!(RawMode::from_str("html", true).is_ok());
        assert!(RawMode::from_str("text", true).is_ok());
        assert!(RawMode::from_str("xml", true).is_err());
    }
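
    // A minimal in-process sketch of the derive's wiring, using clap's
    // `try_parse_from` so no binary needs to be built. The URL is a
    // placeholder; nothing is fetched. This assumes clap routes free args
    // that don't name a subcommand to the positional `urls`, as the
    // after_help examples imply.
    #[test]
    fn cli_parses_basic_invocation() {
        use clap::Parser;
        let cli = Cli::try_parse_from(["servo-fetch", "https://example.com"]).unwrap();
        assert_eq!(cli.urls, ["https://example.com"]);
        assert_eq!(cli.timeout, 30);
        assert!(!cli.json);
    }

    // Same approach for the crawl subcommand: check that the documented
    // defaults (limit 50, max depth 3) come through the derive.
    #[test]
    fn crawl_defaults_parse() {
        use clap::Parser;
        let cli = Cli::try_parse_from(["servo-fetch", "crawl", "https://example.com"]).unwrap();
        match cli.command {
            Some(Command::Crawl { limit, max_depth, .. }) => {
                assert_eq!(limit, 50);
                assert_eq!(max_depth, 3);
            }
            _ => panic!("expected the crawl subcommand"),
        }
    }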
}

#[cfg(test)]
mod cli_tests {
    use assert_cmd::Command;
    use predicates::prelude::*;

    fn servo_fetch() -> Command {
        Command::cargo_bin("servo-fetch").expect("binary exists")
    }

    #[test]
    fn conflicting_json_and_screenshot() {
        servo_fetch()
            .args(["--json", "--screenshot", "out.png", "https://example.com"])
            .assert()
            .failure()
            .stderr(predicate::str::contains("cannot be used with"));
    }

    #[test]
    fn settle_rejects_out_of_range() {
        servo_fetch()
            .args(["--settle", "10001", "https://example.com"])
            .assert()
            .failure()
            .stderr(predicate::str::contains("invalid value"));
    }

    #[test]
    fn raw_conflicts_with_json() {
        servo_fetch()
            .args(["--raw", "html", "--json", "https://example.com"])
            .assert()
            .failure()
            .stderr(predicate::str::contains("cannot be used with"));
    }

    #[test]
    fn full_page_requires_screenshot() {
        servo_fetch()
            .args(["--full-page", "https://example.com"])
            .assert()
            .failure()
            .stderr(predicate::str::contains("--screenshot"));
    }
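
    // A hedged smoke test: `--help` on the crawl subcommand should render
    // and exit successfully. Only the exit status is checked, since the
    // exact help text is generated by clap.
    #[test]
    fn crawl_help_renders() {
        servo_fetch().args(["crawl", "--help"]).assert().success();
    }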
}