use clap::{Args, Parser, Subcommand};
use liteparse::config::{LiteParseConfig, OutputFormat};
use liteparse::conversion;
use liteparse::extract;
use liteparse::output::{json, text};
use liteparse::parser::LiteParse;
use liteparse::render;
// Top-level command-line definition for the `lit` binary.
// Plain `//` comments are used on purpose: clap's derive macros turn `///`
// doc comments into help text, which would change the program's output.
#[derive(Parser, Debug)]
#[command(
name = "lit",
version,
about = "OSS document parsing tool (supports PDF, DOCX, XLSX, images, and more)"
)]
struct Cli {
// The subcommand chosen on the command line; `main` matches on it.
#[command(subcommand)]
command: Commands,
}
// Subcommands accepted by `lit`; each variant carries that subcommand's
// parsed arguments. (Plain `//` comments: clap would turn `///` into help text.)
#[derive(Subcommand, Debug)]
enum Commands {
// Parse a single file and emit the formatted result to a file or stdout.
Parse(ParseCommand),
// Write one `page_<n>.png` per result returned by `LiteParse::screenshot`.
Screenshot(ScreenshotCommand),
// Parse every matching file under an input directory into an output directory.
BatchParse(BatchParseCommand),
// Hidden from help output (`hide = true`); dispatches to `extract::extract`.
#[command(hide = true)]
Extract(ExtractCommand),
// Hidden from help output (`hide = true`); dispatches to `render::image_bounds`.
// Shares its argument struct with `Extract`.
#[command(hide = true)]
ImageBounds(ExtractCommand),
}
// Arguments of the `parse` subcommand. Most fields are copied verbatim into
// `LiteParseConfig` by `main`. (Plain `//` comments: clap would turn `///`
// into help text.)
#[derive(Args, Debug)]
struct ParseCommand {
// Positional: path of the document handed to `LiteParse::parse`.
file: String,
// `-o/--output`: when set, the formatted result is written to this path;
// otherwise it is printed to stdout.
#[arg(short, long)]
output: Option<String>,
// `--format`: validated by `parse_output_format` ("json", "text", "markdown"/"md").
#[arg(long, default_value = "text")]
format: String,
// `--no-ocr`: negated into `LiteParseConfig::ocr_enabled`.
#[arg(long)]
no_ocr: bool,
// `--ocr-language`: forwarded to `LiteParseConfig::ocr_language`.
#[arg(long, default_value = "eng")]
ocr_language: String,
// `--ocr-server-url`: forwarded to `LiteParseConfig::ocr_server_url`.
#[arg(long, default_value = None)]
ocr_server_url: Option<String>,
// `--ocr-server-header` (may be repeated): each value is split into a
// (name, value) pair by `parse_header`.
#[arg(long = "ocr-server-header", value_parser = parse_header)]
ocr_server_headers: Vec<(String, String)>,
// `--tessdata-path`: forwarded to `LiteParseConfig::tessdata_path`.
#[arg(long)]
tessdata_path: Option<String>,
// `--max-pages`: forwarded to `LiteParseConfig::max_pages`.
#[arg(long, default_value = "1000")]
max_pages: usize,
// `--target-pages`: forwarded unparsed to `LiteParseConfig::target_pages`.
#[arg(long)]
target_pages: Option<String>,
// `--dpi`: forwarded to `LiteParseConfig::dpi`.
#[arg(long, default_value = "150")]
dpi: f32,
// `--preserve-small-text`: forwarded to `LiteParseConfig::preserve_very_small_text`.
#[arg(long)]
preserve_small_text: bool,
// `--password`: forwarded to `LiteParseConfig::password`.
#[arg(long)]
password: Option<String>,
// `-q/--quiet`: suppresses this CLI's "[liteparse] wrote ..." messages on
// stderr and is also forwarded to `LiteParseConfig::quiet`.
#[arg(short, long)]
quiet: bool,
// `--num-workers`: overrides `LiteParseConfig::num_workers` only when given;
// otherwise the config default is kept.
#[arg(long)]
num_workers: Option<usize>,
// `--image-mode`: validated by `parse_image_mode` ("off"/"none", "placeholder", "embed").
#[arg(long, default_value = "placeholder")]
image_mode: String,
// `--image-output-dir`: when set and the parse result contains images, each
// one is written into this directory as `image_<id>.<format>`.
#[arg(long)]
image_output_dir: Option<String>,
// `--no-links`: negated into `LiteParseConfig::extract_links`.
#[arg(long)]
no_links: bool,
}
// Arguments of the `screenshot` subcommand. (Plain `//` comments: clap would
// turn `///` into help text.)
#[derive(Args, Debug)]
struct ScreenshotCommand {
// Positional: path of the document handed to `LiteParse::screenshot`.
file: String,
// `-o/--output-dir`: directory (created if missing) that receives `page_<n>.png` files.
#[arg(short, long, default_value = "./screenshots")]
output_dir: String,
// `--target-pages`: validated up front with `liteparse::config::parse_target_pages`;
// the raw string is also stored in `LiteParseConfig::target_pages`.
#[arg(long)]
target_pages: Option<String>,
// `--dpi`: forwarded to `LiteParseConfig::dpi`.
#[arg(long, default_value = "150")]
dpi: f32,
// `--password`: forwarded to `LiteParseConfig::password`.
#[arg(long)]
password: Option<String>,
// `-q/--quiet`: suppresses the per-page stderr message and is forwarded to
// `LiteParseConfig::quiet`.
#[arg(short, long)]
quiet: bool,
}
// Arguments of the `batch-parse` subcommand. Unlike `parse`, there is no
// `--target-pages` or `--preserve-small-text`; `main` fixes those to `None`
// and `false` for batch runs. (Plain `//` comments: clap would turn `///`
// into help text.)
#[derive(Args, Debug)]
struct BatchParseCommand {
// First positional: directory scanned for input files.
input_dir: String,
// Second positional: directory (created if missing) that receives one
// output file per successfully parsed input.
output_dir: String,
// `--format`: validated by `parse_output_format`; also selects the output
// file extension ("json", "md" or "txt").
#[arg(long, default_value = "text")]
format: String,
// `--no-ocr`: negated into `LiteParseConfig::ocr_enabled`.
#[arg(long)]
no_ocr: bool,
// `--ocr-language`: forwarded to `LiteParseConfig::ocr_language`.
#[arg(long, default_value = "eng")]
ocr_language: String,
// `--ocr-server-url`: forwarded to `LiteParseConfig::ocr_server_url`.
#[arg(long, default_value = None)]
ocr_server_url: Option<String>,
// `--ocr-server-header` (may be repeated): each value is split into a
// (name, value) pair by `parse_header`.
#[arg(long = "ocr-server-header", value_parser = parse_header)]
ocr_server_headers: Vec<(String, String)>,
// `--tessdata-path`: forwarded to `LiteParseConfig::tessdata_path`.
#[arg(long)]
tessdata_path: Option<String>,
// `--max-pages`: forwarded to `LiteParseConfig::max_pages`.
#[arg(long, default_value = "1000")]
max_pages: usize,
// `--dpi`: forwarded to `LiteParseConfig::dpi`.
#[arg(long, default_value = "150")]
dpi: f32,
// `--recursive`: also descend into subdirectories of `input_dir`.
#[arg(long)]
recursive: bool,
// `--extension`: keep only files whose lowercased path ends with this
// extension (a leading '.' is added if missing). When absent, files are
// filtered with `conversion::is_supported_extension` instead.
#[arg(long)]
extension: Option<String>,
// `--password`: forwarded to `LiteParseConfig::password` for every file.
#[arg(long)]
password: Option<String>,
// `-q/--quiet`: suppresses the "found N files" and per-file success
// messages; errors and the final summary are still printed.
#[arg(short, long)]
quiet: bool,
// `--num-workers`: overrides `LiteParseConfig::num_workers` only when given.
#[arg(long)]
num_workers: Option<usize>,
}
// Arguments shared by the hidden `extract` and `image-bounds` subcommands.
// (Plain `//` comments: clap would turn `///` into help text.)
#[derive(Args, Debug)]
struct ExtractCommand {
// `--pdf-path` (required): passed as the first argument to
// `extract::extract` / `render::image_bounds`.
#[arg(long)]
pdf_path: String,
// `--page-num` (optional): passed through unchanged as the second argument.
#[arg(long)]
page_num: Option<u32>,
}
/// Maps a user-supplied `--format` value onto an [`OutputFormat`].
///
/// Matching is case-insensitive. Accepted spellings are `json`, `text`,
/// `markdown` and its alias `md`.
///
/// # Errors
///
/// Returns a human-readable message naming the rejected value when it is not
/// one of the accepted spellings.
fn parse_output_format(s: &str) -> Result<OutputFormat, String> {
    let normalized = s.to_lowercase();

    if normalized == "json" {
        return Ok(OutputFormat::Json);
    }
    if normalized == "text" {
        return Ok(OutputFormat::Text);
    }
    if normalized == "markdown" || normalized == "md" {
        return Ok(OutputFormat::Markdown);
    }

    // The error echoes the value exactly as typed, not the lowercased form.
    Err(format!(
        "unknown format '{}', expected 'json', 'text', or 'markdown'",
        s
    ))
}
/// Splits a `Name: Value` string into a trimmed `(name, value)` pair.
///
/// Only the first `:` separates name from value, so values may themselves
/// contain colons (for example URLs). The value may be empty.
///
/// # Errors
///
/// Returns a message when the input contains no `:` or when the name is
/// empty after trimming whitespace.
fn parse_header(s: &str) -> Result<(String, String), String> {
    let Some((raw_name, raw_value)) = s.split_once(':') else {
        return Err(format!("invalid header '{}', expected 'Name: Value'", s));
    };

    match raw_name.trim() {
        "" => Err(format!("invalid header '{}', empty header name", s)),
        header_name => Ok((header_name.to_string(), raw_value.trim().to_string())),
    }
}
/// Maps a user-supplied `--image-mode` value onto an `ImageMode`.
///
/// Matching is case-insensitive. Accepted spellings are `off` (alias `none`),
/// `placeholder` and `embed`.
///
/// # Errors
///
/// Returns a human-readable message naming the rejected value when it is not
/// one of the accepted spellings.
fn parse_image_mode(s: &str) -> Result<liteparse::config::ImageMode, String> {
    use liteparse::config::ImageMode;

    let normalized = s.to_lowercase();
    let mode = match normalized.as_str() {
        "off" | "none" => ImageMode::Off,
        "placeholder" => ImageMode::Placeholder,
        "embed" => ImageMode::Embed,
        _ => {
            // The error echoes the value exactly as typed, not the lowercased form.
            return Err(format!(
                "unknown image-mode '{}', expected 'off', 'placeholder', or 'embed'",
                s
            ));
        }
    };
    Ok(mode)
}
/// Entry point of the `lit` CLI: parses the command line and runs the
/// selected subcommand.
///
/// * `parse` – parses one file, optionally dumps extracted images, then
///   writes the formatted result to `--output` or stdout.
/// * `screenshot` – writes one `page_<n>.png` per returned page into the
///   output directory.
/// * `batch-parse` – parses every collected file under the input directory
///   and mirrors the directory layout under the output directory. Parse and
///   formatting failures are reported per file and counted; the process exits
///   with status 1 if any file failed.
/// * `extract` / `image-bounds` – hidden helpers that forward to
///   `extract::extract` and `render::image_bounds`.
///
/// # Errors
///
/// Returns an error for an invalid `--format`, `--image-mode` or
/// `--target-pages` value, for any filesystem failure, and for parse or
/// screenshot failures outside the per-file handling of `batch-parse`.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let cli = Cli::parse();
    match cli.command {
        Commands::Parse(cmd) => {
            let format = parse_output_format(&cmd.format)?;
            let image_mode = parse_image_mode(&cmd.image_mode)?;
            let mut config = LiteParseConfig {
                ocr_language: cmd.ocr_language,
                ocr_enabled: !cmd.no_ocr,
                tessdata_path: cmd.tessdata_path,
                max_pages: cmd.max_pages,
                target_pages: cmd.target_pages,
                dpi: cmd.dpi,
                output_format: format,
                preserve_very_small_text: cmd.preserve_small_text,
                password: cmd.password,
                quiet: cmd.quiet,
                ocr_server_url: cmd.ocr_server_url,
                ocr_server_headers: cmd.ocr_server_headers,
                image_mode,
                extract_links: !cmd.no_links,
                ..Default::default()
            };
            // Keep the library default unless the user asked for a worker count.
            if let Some(n) = cmd.num_workers {
                config.num_workers = n;
            }
            let lp = LiteParse::new(config);
            let result = lp.parse(&cmd.file).await?;
            let formatted = match lp.config().output_format {
                OutputFormat::Json => json::format_json(&result.pages)?,
                OutputFormat::Text => text::format_text(&result.pages),
                OutputFormat::Markdown => result.text.clone(),
            };
            // Images are only written when a directory was requested and
            // there is something to write, so no empty directory is created.
            if let Some(dir) = cmd.image_output_dir.as_deref()
                && !result.images.is_empty()
            {
                std::fs::create_dir_all(dir)?;
                let image_dir = std::path::Path::new(dir);
                for img in &result.images {
                    let path = image_dir.join(format!("image_{}.{}", img.id, img.format));
                    std::fs::write(&path, &img.bytes)?;
                }
                if !cmd.quiet {
                    eprintln!(
                        "[liteparse] wrote {} image(s) to {}",
                        result.images.len(),
                        dir
                    );
                }
            }
            match cmd.output {
                Some(path) => {
                    std::fs::write(&path, &formatted)?;
                    if !cmd.quiet {
                        eprintln!("[liteparse] wrote output to {}", path);
                    }
                }
                None => {
                    println!("{}", formatted);
                }
            }
        }
        Commands::Screenshot(cmd) => {
            // Validate the page selection before touching the filesystem.
            let target_pages = cmd
                .target_pages
                .as_ref()
                .map(|s| liteparse::config::parse_target_pages(s))
                .transpose()
                .map_err(|e| format!("invalid --target-pages: {}", e))?;
            std::fs::create_dir_all(&cmd.output_dir)?;
            let config = LiteParseConfig {
                target_pages: cmd.target_pages.clone(),
                dpi: cmd.dpi,
                password: cmd.password.clone(),
                quiet: cmd.quiet,
                ..Default::default()
            };
            let lp = LiteParse::new(config);
            let results = lp.screenshot(&cmd.file, target_pages).await?;
            let output_root = std::path::Path::new(&cmd.output_dir);
            for result in results {
                let output_path = output_root.join(format!("page_{}.png", result.page_num));
                std::fs::write(&output_path, &result.image_bytes)?;
                if !cmd.quiet {
                    eprintln!(
                        "[liteparse] screenshot page {} → {}",
                        result.page_num,
                        output_path.display()
                    );
                }
            }
        }
        Commands::BatchParse(cmd) => {
            let format = parse_output_format(&cmd.format)?;
            // Normalise the filter to a lowercase ".ext" suffix.
            let ext_filter = cmd.extension.as_ref().map(|e| {
                let e = e.to_lowercase();
                if e.starts_with('.') {
                    e
                } else {
                    format!(".{}", e)
                }
            });
            let mut config = LiteParseConfig {
                ocr_language: cmd.ocr_language,
                ocr_enabled: !cmd.no_ocr,
                tessdata_path: cmd.tessdata_path,
                max_pages: cmd.max_pages,
                target_pages: None,
                dpi: cmd.dpi,
                output_format: format.clone(),
                preserve_very_small_text: false,
                password: cmd.password,
                quiet: cmd.quiet,
                ocr_server_url: cmd.ocr_server_url,
                ocr_server_headers: cmd.ocr_server_headers,
                ..Default::default()
            };
            if let Some(n) = cmd.num_workers {
                config.num_workers = n;
            }
            let lp = LiteParse::new(config);
            let out_ext = match format {
                OutputFormat::Json => "json",
                OutputFormat::Markdown => "md",
                OutputFormat::Text => "txt",
            };
            std::fs::create_dir_all(&cmd.output_dir)?;
            let files = collect_files(&cmd.input_dir, cmd.recursive, ext_filter.as_deref())?;
            if files.is_empty() {
                eprintln!("[liteparse] no matching files found in {}", cmd.input_dir);
                return Ok(());
            }
            if !cmd.quiet {
                eprintln!("[liteparse] found {} files to process", files.len());
            }
            let input_root = std::path::Path::new(&cmd.input_dir);
            let output_root = std::path::Path::new(&cmd.output_dir);
            let mut success = 0usize;
            let mut errors = 0usize;
            for file_path in &files {
                let t0 = web_time::Instant::now();
                // Strip the input directory component-wise. A textual prefix
                // strip leaves a leading separator ("/a.pdf") whenever the
                // input directory was given without a trailing slash, and
                // `Path::join` treats such a rooted path as a replacement for
                // the base, sending output outside the output directory.
                let source = std::path::Path::new(file_path);
                let rel = source.strip_prefix(input_root).unwrap_or(source);
                let out_path = output_root.join(rel).with_extension(out_ext);
                if let Some(parent) = out_path.parent() {
                    std::fs::create_dir_all(parent)?;
                }
                match lp.parse(file_path).await {
                    Ok(result) => {
                        let fmt_result: Result<String, Box<dyn std::error::Error>> =
                            match lp.config().output_format {
                                OutputFormat::Json => {
                                    json::format_json(&result.pages).map_err(|e| e.into())
                                }
                                OutputFormat::Text => Ok(text::format_text(&result.pages)),
                                OutputFormat::Markdown => Ok(result.text.clone()),
                            };
                        match fmt_result {
                            Ok(formatted) => {
                                std::fs::write(&out_path, &formatted)?;
                                success += 1;
                                if !cmd.quiet {
                                    let elapsed = t0.elapsed().as_secs_f64() * 1000.0;
                                    eprintln!(
                                        "[liteparse] {} → {} ({:.1}ms)",
                                        file_path,
                                        out_path.display(),
                                        elapsed
                                    );
                                }
                            }
                            Err(e) => {
                                eprintln!("[liteparse] error formatting {}: {}", file_path, e);
                                errors += 1;
                            }
                        }
                    }
                    Err(e) => {
                        eprintln!("[liteparse] error parsing {}: {}", file_path, e);
                        errors += 1;
                    }
                }
            }
            // The summary is printed even in quiet mode.
            eprintln!(
                "[liteparse] batch complete: {} succeeded, {} failed",
                success, errors
            );
            if errors > 0 {
                std::process::exit(1);
            }
        }
        Commands::Extract(cmd) => {
            extract::extract(&cmd.pdf_path, cmd.page_num)?;
        }
        Commands::ImageBounds(cmd) => {
            render::image_bounds(&cmd.pdf_path, cmd.page_num)?;
        }
    }
    Ok(())
}
/// Gathers the files under `dir` that pass the extension filter and returns
/// their paths as strings in sorted order.
///
/// `recursive` and `ext_filter` are passed straight through to
/// `collect_files_inner`, which does the directory walk.
///
/// # Errors
///
/// Propagates any error returned by the directory walk.
fn collect_files(
    dir: &str,
    recursive: bool,
    ext_filter: Option<&str>,
) -> Result<Vec<String>, Box<dyn std::error::Error>> {
    let root = std::path::Path::new(dir);
    let mut found: Vec<String> = Vec::new();

    collect_files_inner(root, recursive, ext_filter, &mut found)?;

    // Sort so the processing order does not depend on directory iteration order.
    found.sort();
    Ok(found)
}
/// Appends to `files` every non-directory entry of `dir` that passes the filter.
///
/// * With `ext_filter` set, an entry is kept when its lowercased path ends
///   with the given suffix.
/// * Without it, an entry is kept when `conversion::is_supported_extension`
///   accepts its path.
///
/// Subdirectories are descended into only when `recursive` is true; they are
/// never added to `files` themselves. Paths are converted lossily to strings.
///
/// # Errors
///
/// Returns the first error from reading a directory or one of its entries.
fn collect_files_inner(
    dir: &std::path::Path,
    recursive: bool,
    ext_filter: Option<&str>,
    files: &mut Vec<String>,
) -> Result<(), Box<dyn std::error::Error>> {
    for dir_entry in std::fs::read_dir(dir)? {
        let candidate = dir_entry?.path();

        if candidate.is_dir() {
            if recursive {
                collect_files_inner(&candidate, recursive, ext_filter, files)?;
            }
            continue;
        }

        let candidate_str = candidate.to_string_lossy().to_string();
        let keep = match ext_filter {
            Some(suffix) => candidate_str.to_lowercase().ends_with(suffix),
            None => conversion::is_supported_extension(&candidate_str),
        };
        if keep {
            files.push(candidate_str);
        }
    }
    Ok(())
}