servo-fetch-cli 0.11.4

A browser engine in a binary — fetch, render, and extract web content as Markdown, JSON, or screenshots. Powered by Servo.
//! Default fetch command — single URL, batch, and PDF probe.

use std::fs;
use std::io::{self, Write as _};
use std::path::Path;
use std::time::Duration;

use anyhow::{Error, Result, bail};
use servo_fetch::{FetchOptions, Page};

use crate::cli::{FetchArgs, Format};
use crate::output::{self, Sink};
use crate::progress::Progress;

/// Fetch one or more URLs and write the rendered output to stdout, a file, or a directory.
///
/// The arguments are validated and the URL list is checked for emptiness *before* any
/// directory is created, so a rejected invocation has no filesystem side effects.
/// A single URL is handled synchronously by `run_single`; two or more are handed to
/// `run_batch` on a Tokio runtime created for this call.
///
/// # Errors
///
/// Fails when `validate_args` rejects the arguments, when no URL was given, when an
/// output directory cannot be created, when the Tokio runtime cannot be started, or when
/// the fetch/emit step itself fails.
pub(crate) fn run(args: &FetchArgs) -> Result<()> {
    validate_args(args)?;
    // Reject an empty URL list up front: creating output directories for a command
    // that is about to fail would leave stray directories behind.
    if args.urls.is_empty() {
        bail!("URL is required. Run with --help for usage.");
    }

    if let Some(dir) = args.output_dir.as_deref() {
        fs::create_dir_all(dir)?;
    }
    if let Some(file) = args.output.as_deref() {
        // A bare file name has an empty parent; there is nothing to create in that case.
        if let Some(parent) = file.parent().filter(|p| !p.as_os_str().is_empty()) {
            fs::create_dir_all(parent)?;
        }
    }

    match args.urls.as_slice() {
        [one] => run_single(args, one),
        many => {
            let rt = tokio::runtime::Runtime::new()?;
            rt.block_on(run_batch(args, many))
        }
    }
}

/// Reject option combinations that cannot be honoured.
///
/// Checks run in a fixed order and the first violated rule determines the error:
/// selector vs. raw formats, the PNG-specific restrictions, `--full-page` without PNG,
/// and finally the restrictions that only apply when more than one URL is given.
///
/// # Errors
///
/// Returns an error describing the first conflicting combination found.
pub(crate) fn validate_args(args: &FetchArgs) -> Result<()> {
    let is_png = args.format == Format::Png;
    let is_raw = matches!(args.format, Format::Html | Format::Text);
    let has_selector = args.selector.is_some();
    let multiple_urls = args.urls.len() > 1;

    if is_raw && has_selector {
        bail!("--selector cannot be used with --format html or text");
    }

    // PNG-only restrictions.
    if is_png && has_selector {
        bail!("--selector cannot be used with --format png");
    }
    if is_png && multiple_urls {
        bail!("--format png only supports a single URL");
    }
    if is_png && args.output_dir.is_some() {
        bail!("--format png cannot be used with --output-dir");
    }

    if args.full_page && !is_png {
        bail!("--full-page requires --format png");
    }

    // Restrictions that only apply to batch runs.
    if multiple_urls && args.output.is_some() {
        bail!("-o/--output is only valid with a single URL; use --output-dir for multiple URLs");
    }
    if multiple_urls && (args.js.is_some() || is_raw) {
        bail!("--js, and --format html or text cannot be used with multiple URLs");
    }

    Ok(())
}

/// Build the output [`Sink`] from the `output` and `output_dir` arguments.
fn sink(args: &FetchArgs) -> Sink<'_> {
    let file = args.output.as_deref();
    let dir = args.output_dir.as_deref();
    Sink::from_args(file, dir)
}

/// Fetch a single URL and write it through the sink selected by `args`.
///
/// # Errors
///
/// Fails if the fetch options cannot be built (for example an unreadable schema), if the
/// fetch itself fails, or if writing the output fails.
fn run_single(args: &FetchArgs, url_str: &str) -> Result<()> {
    // Build the options before starting the ticker: an early return on a bad schema
    // would otherwise leave this function without ever calling `progress.clear()`.
    let opts = build_fetch_options(args, url_str)?;

    let progress = Progress::new();
    progress.ticker(&format!("Fetching {url_str}..."));
    let fetched = servo_fetch::fetch(opts);
    // Clear the ticker before propagating a fetch error.
    progress.clear();

    let page = fetched?;
    dispatch_output(args, &page, url_str, sink(args))
}

async fn run_batch(args: &FetchArgs, urls: &[String]) -> Result<()> {
    let total = urls.len();
    let progress = Progress::new();
    progress.header(&format!("Fetching {total} URLs..."));

    let schema = args.schema.as_ref().map(|p| load_schema(p)).transpose()?;

    let sem = std::sync::Arc::new(tokio::sync::Semaphore::new(4));
    let (tx, mut rx) = tokio::sync::mpsc::channel::<(String, std::result::Result<Page, servo_fetch::Error>)>(total);

    for url in urls {
        let permit = sem.clone().acquire_owned().await?;
        let tx = tx.clone();
        let url_str = url.clone();
        let timeout = args.timeout;
        let settle = args.settle;
        let user_agent = args.user_agent.clone();
        let schema = schema.clone();
        let visibility = args.visibility.to_policy();
        tokio::task::spawn_blocking(move || {
            let mut opts = FetchOptions::new(&url_str)
                .timeout(Duration::from_secs(timeout))
                .settle(Duration::from_millis(settle))
                .visibility(visibility);
            if let Some(ua) = user_agent {
                opts = opts.user_agent(ua);
            }
            if let Some(s) = schema {
                opts = opts.schema(s);
            }
            let result = servo_fetch::fetch(opts);
            let _ = tx.blocking_send((url_str, result));
            drop(permit);
        });
    }
    drop(tx);

    let sink = sink(args);
    let mut completed = 0usize;
    let mut failures = 0usize;
    while let Some((url, result)) = rx.recv().await {
        completed += 1;
        match result {
            Ok(page) => {
                batch_emit(args, &page, &url, sink)?;
                progress.item_done(completed, Some(total), &url, true);
            }
            Err(err) => {
                failures += 1;
                tracing::error!(url = %url, "{err:#}");
            }
        }
    }

    if failures == total {
        bail!("all {total} URLs failed");
    }
    Ok(())
}

/// Write one page of a batch run in compact, per-URL form.
///
/// Schema extraction takes priority over `args.format`. Markdown sent to stdout is
/// framed by a `--- <url> ---` header line and a trailing blank line; Markdown sent to
/// any other sink is written unframed.
///
/// # Panics
///
/// Panics for the HTML, text, and PNG formats, which `validate_args` rejects for
/// multi-URL runs.
fn batch_emit(args: &FetchArgs, page: &Page, url: &str, sink: Sink<'_>) -> Result<()> {
    if args.schema.is_some() {
        let extracted = output::Extracted { page, url };
        return extracted.execute_compact(sink);
    }

    let selector = args.selector.as_deref();
    match args.format {
        Format::Html | Format::Text | Format::Png => {
            unreachable!("guarded by validate_args before batch dispatch")
        }
        Format::Json => {
            let json = output::Json { page, url, selector };
            json.execute_compact(sink)
        }
        Format::Markdown => {
            // Only stdout output gets the separator lines around each document.
            let framed = sink.is_stdout();
            if framed {
                writeln!(io::stdout(), "--- {url} ---")?;
            }
            output::Markdown { page, url, selector }.execute(sink)?;
            if framed {
                writeln!(io::stdout())?;
            }
            Ok(())
        }
    }
}

/// Write a single fetched page using the first applicable output mode.
///
/// Precedence: a JavaScript result on the page, then schema extraction when
/// `args.schema` is set, then the renderer matching `args.format`.
fn dispatch_output(args: &FetchArgs, page: &Page, url: &str, sink: Sink<'_>) -> Result<()> {
    match (page.js_result.as_deref(), args.schema.is_some()) {
        (Some(js_value), _) => output::js_eval(url, js_value, sink),
        (None, true) => output::Extracted { page, url }.execute(sink),
        (None, false) => {
            let selector = args.selector.as_deref();
            match args.format {
                Format::Png => output::Screenshot { page, sink }.execute(),
                Format::Html => output::raw(url, output::Ext::Html, &page.html, sink),
                Format::Text => output::raw(url, output::Ext::Text, &page.inner_text, sink),
                Format::Json => output::Json { page, url, selector }.execute(sink),
                Format::Markdown => output::Markdown { page, url, selector }.execute(sink),
            }
        }
    }
}

/// Translate the CLI arguments into [`FetchOptions`] for one URL.
///
/// PNG output selects the screenshot constructor and takes priority over `args.js`;
/// otherwise a JavaScript expression selects the JavaScript constructor, and a plain
/// fetch is the fallback. Timeout, settle time and visibility are always applied; the
/// user agent and schema only when present.
///
/// # Errors
///
/// Fails if `args.schema` is set and the schema cannot be loaded.
fn build_fetch_options(args: &FetchArgs, url: &str) -> Result<FetchOptions> {
    let wants_png = args.format == Format::Png;
    let base = match (wants_png, args.js.as_deref()) {
        (true, _) => FetchOptions::screenshot(url, args.full_page),
        (false, Some(expr)) => FetchOptions::javascript(url, expr),
        (false, None) => FetchOptions::new(url),
    };

    let mut opts = base
        .timeout(Duration::from_secs(args.timeout))
        .settle(Duration::from_millis(args.settle))
        .visibility(args.visibility.to_policy());

    if let Some(ua) = args.user_agent.as_ref() {
        opts = opts.user_agent(ua);
    }
    if let Some(path) = args.schema.as_ref() {
        opts = opts.schema(load_schema(path)?);
    }
    Ok(opts)
}

/// Load an extraction schema from `path`.
///
/// # Errors
///
/// Returns an error prefixed with the offending path when the schema cannot be loaded.
fn load_schema(path: &Path) -> Result<servo_fetch::schema::ExtractSchema> {
    use servo_fetch::schema::ExtractSchema;

    match ExtractSchema::from_path(path) {
        Ok(schema) => Ok(schema),
        Err(e) => Err(anyhow::anyhow!("schema '{}': {e}", path.display())),
    }
}