use clap::{ArgAction, Parser};
use std::path::PathBuf;
// Command-line arguments for `paperdown`, parsed through clap's derive API.
//
// NOTE: the annotations in this block are plain `//` comments on purpose.
// clap's derive macro reads `///` doc comments as about/help text, so doc
// comments placed here could end up in the rendered help output.
#[derive(Debug, Clone, Parser)]
#[command(
name = "paperdown",
version,
about = "Convert academic PDF files into markdown with local figures via Z.AI OCR.",
// The trailing `\` on each line below continues the string literal onto the
// next source line; the explicit `\n` escapes are the only line breaks that
// reach the rendered text.
long_about = "paperdown converts one PDF or a directory of PDFs into markdown output folders.\n\n\
For each PDF, it creates:\n\
- <output>/<pdf_stem>/index.md\n\
- <output>/<pdf_stem>/figures/\n\
- <output>/<pdf_stem>/log.jsonl\n\n\
API key lookup order:\n\
1) ZAI_API_KEY from --env-file\n\
2) ZAI_API_KEY from environment",
after_help = "Examples:\n \
paperdown --input pdf/paper.pdf\n \
paperdown --input pdf/ --output md/ --workers 4\n \
paperdown --input pdf/ --output md/ --overwrite\n\n\
Notes:\n \
Without --overwrite, existing index.md or figures/ causes a failure.\n \
Progress bars are shown on stderr only when running in a TTY."
)]
pub struct Cli {
// `--input <PATH>`: the only argument without a default, so it must always be given.
#[arg(
long,
value_name = "PATH",
required = true,
help = "Input path: a single .pdf file or a directory containing .pdf files."
)]
pub input: PathBuf,
// `--output`: falls back to the relative directory `md` when omitted.
#[arg(
long,
default_value = "md",
help = "Output root directory for generated markdown folders."
)]
pub output: PathBuf,
// `--env-file`: falls back to `.env` when omitted.
#[arg(
long = "env-file",
default_value = ".env",
help = "Path to .env file checked first for ZAI_API_KEY, before environment fallback."
)]
pub env_file: PathBuf,
// `--timeout`: defaults to 180; `range(1..)` makes clap reject 0 at parse time.
#[arg(
long,
default_value_t = 180u64,
value_parser = clap::value_parser!(u64).range(1..),
help = "HTTP timeout in seconds for OCR requests and figure downloads."
)]
pub timeout: u64,
// `--max-download-bytes`: defaults to 20_971_520 (20 * 1024 * 1024); 0 is
// rejected by the same `range(1..)` parser used for `--timeout`.
#[arg(
long = "max-download-bytes",
default_value_t = 20_971_520u64,
value_parser = clap::value_parser!(u64).range(1..),
help = "Maximum allowed size (bytes) for each downloaded figure file."
)]
pub max_download_bytes: u64,
// `--workers`: the default comes from `default_workers()`; explicit values go
// through `parse_positive_usize`, which rejects 0 and non-integer input.
#[arg(
long,
default_value_t = default_workers(),
value_parser = parse_positive_usize,
help = "Maximum number of PDFs processed concurrently in batch mode."
)]
pub workers: usize,
// `-v` / `--verbose`: boolean switch, `false` unless the flag is passed.
#[arg(
short = 'v',
long,
action = ArgAction::SetTrue,
help = "Enable verbose progress messages on stderr."
)]
pub verbose: bool,
// `--overwrite`: boolean switch, `false` unless the flag is passed.
#[arg(
long,
action = ArgAction::SetTrue,
help = "Replace existing managed output artifacts (index.md and figures/)."
)]
pub overwrite: bool,
}
/// Computes the default value for `--workers`.
///
/// The result is four workers per unit of available parallelism, kept within
/// `4..=32`. When the available parallelism cannot be queried, a CPU count of 4
/// is assumed, which yields 16 workers.
pub fn default_workers() -> usize {
    /// CPU count assumed when the platform query fails.
    const FALLBACK_CPUS: usize = 4;
    /// Multiplier applied to the detected CPU count.
    const WORKERS_PER_CPU: usize = 4;
    /// Inclusive lower bound of the returned worker count.
    const MIN_WORKERS: usize = 4;
    /// Inclusive upper bound of the returned worker count.
    const MAX_WORKERS: usize = 32;

    let cpus = match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => FALLBACK_CPUS,
    };
    let scaled = cpus * WORKERS_PER_CPU;
    scaled.clamp(MIN_WORKERS, MAX_WORKERS)
}
/// Parses `value` as a strictly positive `usize`.
///
/// # Errors
///
/// Returns `"invalid integer: <value>"` when `value` does not parse as a
/// `usize`, and `"must be greater than 0"` when it parses to zero.
fn parse_positive_usize(value: &str) -> Result<usize, String> {
    match value.parse::<usize>() {
        Ok(0) => Err(String::from("must be greater than 0")),
        Ok(count) => Ok(count),
        Err(_) => Err(format!("invalid integer: {value}")),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use clap::{CommandFactory, Parser};

    /// The machine-dependent default must stay inside the 4..=32 window.
    #[test]
    fn default_workers_formula_bounds() {
        assert!((4..=32).contains(&default_workers()));
    }

    /// With only `--input` supplied, every other field takes its declared default.
    #[test]
    fn parses_defaults() {
        // Destructure exhaustively so a newly added field forces this test to be revisited.
        let Cli {
            input,
            output,
            env_file,
            timeout,
            max_download_bytes,
            workers,
            verbose,
            overwrite,
        } = Cli::parse_from(["paperdown", "--input", "in.pdf"]);

        assert_eq!(input, PathBuf::from("in.pdf"));
        assert_eq!(output, PathBuf::from("md"));
        assert_eq!(env_file, PathBuf::from(".env"));
        assert_eq!(timeout, 180);
        assert_eq!(max_download_bytes, 20_971_520);
        assert_eq!(workers, default_workers());
        assert!(!verbose);
        assert!(!overwrite);
    }

    /// Each numeric option that must be positive refuses an explicit `0`.
    #[test]
    fn rejects_zero_positive_fields() {
        for flag in ["--timeout", "--max-download-bytes", "--workers"] {
            let outcome = Cli::try_parse_from(["paperdown", "--input", "in.pdf", flag, "0"]);
            assert!(outcome.is_err());
        }
    }

    /// The long help carries the examples, the overwrite note, the API-key
    /// lookup order (file before environment) and the input description.
    #[test]
    fn help_text_contains_examples_and_key_guidance() {
        let rendered = Cli::command().render_long_help().to_string();

        for needle in ["Examples:", "--overwrite"] {
            assert!(rendered.contains(needle));
        }

        let file_pos = rendered.find("1) ZAI_API_KEY from --env-file");
        let env_pos = rendered.find("2) ZAI_API_KEY from environment");
        assert!(file_pos.is_some());
        assert!(env_pos.is_some());
        // Both are `Some` here, so comparing the options compares the byte offsets.
        assert!(file_pos < env_pos);

        assert!(rendered.contains("single .pdf file or a directory"));
    }
}