use std::fs;
use std::io::{self, Write as _};
use std::path::Path;
use std::time::Duration;
use anyhow::{Error, Result, bail};
use servo_fetch::{FetchOptions, Page};
use crate::cli::{FetchArgs, Format};
use crate::output::{self, Sink};
use crate::progress::Progress;
/// Entry point of the fetch command.
///
/// Validates the argument combination, makes sure the requested output
/// locations exist, and then fetches either a single URL directly or several
/// URLs as a batch driven by a freshly created Tokio runtime.
///
/// # Errors
///
/// Fails when the arguments are invalid, when no URL was supplied, when an
/// output directory cannot be created, when the runtime cannot be built, or
/// when the fetch/emit step itself fails.
pub(crate) fn run(args: &FetchArgs) -> Result<()> {
    validate_args(args)?;

    // Reject an empty URL list before touching the filesystem, so an invalid
    // invocation does not leave freshly created directories behind.
    if args.urls.is_empty() {
        bail!("URL is required. Run with --help for usage.");
    }

    if let Some(dir) = args.output_dir.as_deref() {
        if let Err(err) = fs::create_dir_all(dir) {
            // Name the offending path; the bare OS error does not include it.
            bail!("cannot create output directory {dir:?}: {err}");
        }
    }
    if let Some(file) = args.output.as_deref() {
        // A bare file name has an empty parent: nothing to create in that case.
        if let Some(parent) = file.parent().filter(|p| !p.as_os_str().is_empty()) {
            if let Err(err) = fs::create_dir_all(parent) {
                bail!("cannot create parent directory {parent:?}: {err}");
            }
        }
    }

    match args.urls.as_slice() {
        [one] => run_single(args, one),
        // The empty case was rejected above, so this arm only sees 2+ URLs.
        many => {
            let rt = tokio::runtime::Runtime::new()?;
            rt.block_on(run_batch(args, many))
        }
    }
}
/// Rejects flag combinations that the fetch command cannot honour.
///
/// The rules are checked in a fixed order and the first violated rule decides
/// which error is returned.
///
/// # Errors
///
/// Returns an error describing the first conflicting combination found.
pub(crate) fn validate_args(args: &FetchArgs) -> Result<()> {
    let has_selector = args.selector.is_some();
    let is_raw = matches!(args.format, Format::Html | Format::Text);
    let is_png = args.format == Format::Png;
    let is_batch = args.urls.len() > 1;

    // Selector restrictions.
    if is_raw && has_selector {
        bail!("--selector cannot be used with --format html or text");
    }
    if is_png && has_selector {
        bail!("--selector cannot be used with --format png");
    }

    // Screenshot restrictions.
    if is_png && is_batch {
        bail!("--format png only supports a single URL");
    }
    if is_png && args.output_dir.is_some() {
        bail!("--format png cannot be used with --output-dir");
    }
    if args.full_page && !is_png {
        bail!("--full-page requires --format png");
    }

    // Multi-URL restrictions.
    if is_batch && args.output.is_some() {
        bail!("-o/--output is only valid with a single URL; use --output-dir for multiple URLs");
    }
    if is_batch && (args.js.is_some() || is_raw) {
        bail!("--js, and --format html or text cannot be used with multiple URLs");
    }

    Ok(())
}
/// Builds the output sink from the `output` and `output_dir` arguments.
fn sink(args: &FetchArgs) -> Sink<'_> {
    let file = args.output.as_deref();
    let dir = args.output_dir.as_deref();
    Sink::from_args(file, dir)
}
/// Fetches one URL and writes the result to the configured sink.
///
/// The fetch options are built *before* the progress ticker is started, so a
/// failure while building them (for example an unreadable schema file)
/// returns without leaving a ticker that was never cleared.
///
/// # Errors
///
/// Fails when the options cannot be built, when the fetch fails, or when the
/// output cannot be written.
fn run_single(args: &FetchArgs, url_str: &str) -> Result<()> {
    let opts = build_fetch_options(args, url_str)?;

    let progress = Progress::new();
    progress.ticker(&format!("Fetching {url_str}..."));
    let page = servo_fetch::fetch(opts).map_err(Error::from);
    // Clear the ticker on both the success and the failure path before the
    // result is propagated.
    progress.clear();
    let page = page?;

    dispatch_output(args, &page, url_str, sink(args))
}
async fn run_batch(args: &FetchArgs, urls: &[String]) -> Result<()> {
let total = urls.len();
let progress = Progress::new();
progress.header(&format!("Fetching {total} URLs..."));
let schema = args.schema.as_ref().map(|p| load_schema(p)).transpose()?;
let sem = std::sync::Arc::new(tokio::sync::Semaphore::new(4));
let (tx, mut rx) = tokio::sync::mpsc::channel::<(String, std::result::Result<Page, servo_fetch::Error>)>(total);
for url in urls {
let permit = sem.clone().acquire_owned().await?;
let tx = tx.clone();
let url_str = url.clone();
let timeout = args.timeout;
let settle = args.settle;
let user_agent = args.user_agent.clone();
let schema = schema.clone();
let visibility = args.visibility.to_policy();
tokio::task::spawn_blocking(move || {
let mut opts = FetchOptions::new(&url_str)
.timeout(Duration::from_secs(timeout))
.settle(Duration::from_millis(settle))
.visibility(visibility);
if let Some(ua) = user_agent {
opts = opts.user_agent(ua);
}
if let Some(s) = schema {
opts = opts.schema(s);
}
let result = servo_fetch::fetch(opts);
let _ = tx.blocking_send((url_str, result));
drop(permit);
});
}
drop(tx);
let sink = sink(args);
let mut completed = 0usize;
let mut failures = 0usize;
while let Some((url, result)) = rx.recv().await {
completed += 1;
match result {
Ok(page) => {
batch_emit(args, &page, &url, sink)?;
progress.item_done(completed, Some(total), &url, true);
}
Err(err) => {
failures += 1;
tracing::error!(url = %url, "{err:#}");
}
}
}
if failures == total {
bail!("all {total} URLs failed");
}
Ok(())
}
/// Emits one page of a multi-URL run.
///
/// Schema extraction and JSON use the compact form. Markdown written to
/// stdout is framed by a `--- <url> ---` line before and an empty line after;
/// Markdown written elsewhere is emitted unframed.
///
/// # Panics
///
/// Panics for the HTML, text and PNG formats, which `validate_args` rejects
/// for multi-URL runs.
fn batch_emit(args: &FetchArgs, page: &Page, url: &str, sink: Sink<'_>) -> Result<()> {
    if args.schema.is_some() {
        return output::Extracted { page, url }.execute_compact(sink);
    }

    let selector = args.selector.as_deref();
    match args.format {
        Format::Html | Format::Text | Format::Png => {
            unreachable!("guarded by validate_args before batch dispatch")
        }
        Format::Json => output::Json { page, url, selector }.execute_compact(sink),
        Format::Markdown => {
            // Only stdout output gets the per-URL frame.
            let framed = sink.is_stdout();
            if framed {
                writeln!(io::stdout(), "--- {url} ---")?;
            }
            output::Markdown { page, url, selector }.execute(sink)?;
            if framed {
                writeln!(io::stdout())?;
            }
            Ok(())
        }
    }
}
/// Chooses the renderer for a single-URL run.
///
/// Precedence: a JavaScript result on the page wins, then schema extraction,
/// and only then the renderer selected by `args.format`.
fn dispatch_output(args: &FetchArgs, page: &Page, url: &str, sink: Sink<'_>) -> Result<()> {
    match (page.js_result.as_deref(), args.schema.is_some()) {
        (Some(js_value), _) => output::js_eval(url, js_value, sink),
        (None, true) => output::Extracted { page, url }.execute(sink),
        (None, false) => {
            let selector = args.selector.as_deref();
            match args.format {
                Format::Png => output::Screenshot { page, sink }.execute(),
                Format::Html => output::raw(url, output::Ext::Html, &page.html, sink),
                Format::Text => output::raw(url, output::Ext::Text, &page.inner_text, sink),
                Format::Json => output::Json { page, url, selector }.execute(sink),
                Format::Markdown => output::Markdown { page, url, selector }.execute(sink),
            }
        }
    }
}
/// Translates the CLI arguments into `FetchOptions` for one URL.
///
/// The PNG format selects screenshot mode; otherwise a `--js` expression
/// selects JavaScript mode; otherwise a plain fetch is configured. Timeout
/// (seconds), settle time (milliseconds) and visibility are always applied;
/// user agent and schema only when present.
///
/// # Errors
///
/// Fails when the schema file cannot be loaded.
fn build_fetch_options(args: &FetchArgs, url: &str) -> Result<FetchOptions> {
    let wants_screenshot = args.format == Format::Png;
    let mut opts = match args.js.as_deref() {
        _ if wants_screenshot => FetchOptions::screenshot(url, args.full_page),
        Some(expr) => FetchOptions::javascript(url, expr),
        None => FetchOptions::new(url),
    }
    .timeout(Duration::from_secs(args.timeout))
    .settle(Duration::from_millis(args.settle))
    .visibility(args.visibility.to_policy());

    if let Some(ua) = &args.user_agent {
        opts = opts.user_agent(ua);
    }
    if let Some(path) = &args.schema {
        opts = opts.schema(load_schema(path)?);
    }
    Ok(opts)
}
/// Loads an extraction schema from `path`.
///
/// # Errors
///
/// Returns an error that names the schema path together with the underlying
/// failure.
fn load_schema(path: &Path) -> Result<servo_fetch::schema::ExtractSchema> {
    match servo_fetch::schema::ExtractSchema::from_path(path) {
        Ok(schema) => Ok(schema),
        Err(e) => Err(anyhow::anyhow!("schema '{}': {e}", path.display())),
    }
}