paperdown 0.2.0

A fast CLI tool to batch convert PDFs into Markdown using GLM-OCR.
Documentation
mod cli;

use anyhow::Result;
use clap::Parser;
use futures::stream::{self, StreamExt};
use indicatif::{MultiProgress, ProgressBar, ProgressDrawTarget, ProgressStyle};
use paperdown::core::{self, PdfSummary, ProgressCallback, ProgressEvent, collect_pdfs};
use std::io::IsTerminal;
use std::path::Path;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    let code = match run().await {
        Ok(code) => code,
        Err(err) => {
            eprintln!("error: {}", format_error_for_stderr(&err.to_string()));
            1
        }
    };
    std::process::exit(code);
}

async fn run() -> Result<i32> {
    let args = cli::Cli::parse();
    let pdfs = collect_pdfs(&args.input)?;
    let progress = if stderr_is_tty() {
        Some(Arc::new(MultiProgress::with_draw_target(
            ProgressDrawTarget::stderr(),
        )))
    } else {
        None
    };

    if pdfs.len() == 1 {
        if args.verbose {
            eprintln!("Processing 1 PDF: {}", pdfs[0].display());
        }
        let summary = core::process_pdf(
            &pdfs[0],
            &args.output,
            &args.env_file,
            Duration::from_secs(args.timeout),
            args.max_download_bytes,
            args.overwrite,
            progress_callback(&pdfs[0], progress.clone()),
        )
        .await?;
        print_single_summary_stdout(&summary);
        return Ok(0);
    }

    let workers = args.workers.min(pdfs.len()).max(1);
    eprintln!("Processing {} PDFs with {} workers...", pdfs.len(), workers);

    let semaphore = Arc::new(Semaphore::new(workers));
    let results = stream::iter(pdfs.into_iter().map(|pdf| {
        let permit_pool = semaphore.clone();
        let output = args.output.clone();
        let env_file = args.env_file.clone();
        let timeout = Duration::from_secs(args.timeout);
        let max_download_bytes = args.max_download_bytes;
        let overwrite = args.overwrite;
        let progress = progress.clone();
        async move {
            let _permit = permit_pool.acquire_owned().await.expect("semaphore");
            let res = core::process_pdf(
                &pdf,
                &output,
                &env_file,
                timeout,
                max_download_bytes,
                overwrite,
                progress_callback(&pdf, progress),
            )
            .await;
            (pdf, res)
        }
    }))
    .buffer_unordered(workers)
    .collect::<Vec<_>>()
    .await;

    let mut failed_count = 0usize;
    let mut success_count = 0usize;
    let mut downloaded_figures = 0usize;
    for (pdf, result) in results {
        match result {
            Ok(summary) => {
                success_count += 1;
                downloaded_figures += summary.downloaded_figures;
                if args.verbose {
                    eprintln!("  done: {}", pdf.display());
                }
            }
            Err(err) => {
                let rendered = format_error_for_stderr(&err.to_string());
                eprintln!("  failed: {}: {rendered}", pdf.display());
                failed_count += 1;
            }
        }
    }

    print_batch_summary_stdout(success_count, failed_count, downloaded_figures);
    Ok(if failed_count > 0 { 1 } else { 0 })
}

/// Reports whether the process's standard error stream is attached to a terminal.
fn stderr_is_tty() -> bool {
    let handle = std::io::stderr();
    handle.is_terminal()
}

/// Prepares an error message for printing on stderr.
///
/// When stderr is a terminal, every occurrence of `--overwrite` in `message` is wrapped in
/// ANSI escape codes (bold yellow, then reset); otherwise the message is returned unchanged.
fn format_error_for_stderr(message: &str) -> String {
    const PLAIN: &str = "--overwrite";
    const HIGHLIGHTED: &str = "\x1b[1;33m--overwrite\x1b[0m";

    if stderr_is_tty() {
        message.replace(PLAIN, HIGHLIGHTED)
    } else {
        message.to_owned()
    }
}

/// Reports whether the process's standard output stream is attached to a terminal.
fn stdout_is_tty() -> bool {
    let handle = std::io::stdout();
    handle.is_terminal()
}

/// Prints the result of a single-PDF run to stdout.
///
/// On a terminal the summary is spread over three ANSI-coloured lines (source PDF, markdown
/// path, downloaded figure count); otherwise the same three values are emitted on one plain
/// line separated by ` | `. Both paths are shortened with `display_path`.
fn print_single_summary_stdout(summary: &PdfSummary) {
    let pdf = display_path(Path::new(&summary.pdf));
    let markdown = display_path(Path::new(&summary.markdown_path));
    let figures = summary.downloaded_figures;

    if !stdout_is_tty() {
        println!("Done {pdf} | markdown: {markdown} | downloaded figures: {figures}");
        return;
    }

    println!("\x1b[1;32mDone\x1b[0m {pdf}");
    println!("\x1b[36m->\x1b[0m markdown: {markdown}");
    println!("\x1b[36m->\x1b[0m downloaded figures: \x1b[1m{figures}\x1b[0m");
}

/// Prints the one-line batch summary to stdout.
///
/// On a terminal the heading is coloured green when `failed` is zero and yellow otherwise,
/// and each counter is rendered in bold; without a terminal the line carries no escape codes.
fn print_batch_summary_stdout(processed: usize, failed: usize, figures: usize) {
    if !stdout_is_tty() {
        println!("Batch Complete processed: {processed} failed: {failed} figures: {figures}");
        return;
    }

    let color = match failed {
        0 => "\x1b[1;32m",
        _ => "\x1b[1;33m",
    };
    println!(
        "{color}Batch Complete\x1b[0m processed: \x1b[1m{processed}\x1b[0m failed: \x1b[1m{failed}\x1b[0m figures: \x1b[1m{figures}\x1b[0m"
    );
}

fn progress_callback(pdf: &Path, multi: Option<Arc<MultiProgress>>) -> Option<ProgressCallback> {
    let multi = multi?;
    let label = display_path(pdf);
    let spinner = multi.add(ProgressBar::new_spinner());
    spinner.set_style(
        ProgressStyle::with_template("{spinner:.green} {msg}")
            .expect("spinner template")
            .tick_chars("-\\|/ "),
    );
    spinner.set_message(format!("{label} OCR"));
    spinner.enable_steady_tick(Duration::from_millis(90));

    let markdown_pb = multi.add(ProgressBar::new(1));
    markdown_pb.set_style(
        ProgressStyle::with_template("{bar:20.cyan/blue} {bytes}/{total_bytes} {msg}")
            .expect("markdown template"),
    );
    markdown_pb.set_message(format!("{label} markdown"));
    markdown_pb.finish_and_clear();

    let figures_pb = multi.add(ProgressBar::new(1));
    figures_pb.set_style(
        ProgressStyle::with_template("{bar:20.green/blue} {pos}/{len} {msg}")
            .expect("figure template"),
    );
    figures_pb.set_message(format!("{label} figures"));
    figures_pb.finish_and_clear();

    let cb = move |event: ProgressEvent| match event {
        ProgressEvent::OcrStarted => {
            spinner.enable_steady_tick(Duration::from_millis(90));
        }
        ProgressEvent::OcrFinished => {
            spinner.finish_with_message(format!("{label} OCR done"));
        }
        ProgressEvent::MarkdownWriteStarted { bytes } => {
            markdown_pb.reset();
            markdown_pb.set_length(bytes as u64);
            markdown_pb.set_position(0);
        }
        ProgressEvent::MarkdownWriteFinished => {
            let len = markdown_pb.length().unwrap_or(1);
            markdown_pb.set_position(len);
            markdown_pb.finish_with_message(format!("{label} markdown written"));
        }
        ProgressEvent::FigureScanStarted { total } => {
            figures_pb.reset();
            figures_pb.set_length(total as u64);
            figures_pb.set_position(0);
        }
        ProgressEvent::FigureDownloadFinished => {
            figures_pb.inc(1);
            if figures_pb.position() >= figures_pb.length().unwrap_or(0) {
                figures_pb.finish_with_message(format!("{label} figures downloaded"));
            }
        }
    };

    Some(Arc::new(cb))
}

/// Produces a short, human-friendly rendering of `path`.
///
/// If the current working directory can be determined and is a prefix of `path`, the remainder
/// relative to it is returned. Otherwise the final path component is used, and when there is
/// no such component (for example `..`) the whole path is rendered as-is.
fn display_path(path: &Path) -> String {
    let relative_to_cwd = std::env::current_dir()
        .ok()
        .and_then(|cwd| path.strip_prefix(cwd).ok())
        .map(|rel| rel.display().to_string());

    relative_to_cwd.unwrap_or_else(|| match path.file_name() {
        Some(name) => name.to_string_lossy().into_owned(),
        None => path.display().to_string(),
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;
    use std::sync::Arc;

    /// A path below the current directory is shown relative to it.
    #[test]
    fn display_path_uses_relative_path_when_possible() {
        let relative = PathBuf::from("pdf").join("paper.pdf");
        let absolute = std::env::current_dir().expect("cwd").join(&relative);
        assert_eq!(display_path(&absolute), relative.display().to_string());
    }

    /// A path that is not below the current directory collapses to its file name.
    #[test]
    fn display_path_falls_back_to_file_name() {
        let outside = PathBuf::from("/tmp/example.pdf");
        assert_eq!(display_path(&outside), "example.pdf");
    }

    /// The `--overwrite` token is highlighted only when stderr is a terminal.
    #[test]
    fn format_error_for_stderr_rewrites_overwrite_token_when_tty() {
        let message = "Re-run with --overwrite";
        let rendered = format_error_for_stderr(message);
        if !stderr_is_tty() {
            assert_eq!(rendered, message);
            return;
        }
        assert!(rendered.contains("\x1b[1;33m--overwrite\x1b[0m"));
    }

    /// Smoke test: both summary printers run to completion on representative input.
    #[test]
    fn print_helpers_execute_without_panic() {
        let summary = PdfSummary {
            pdf: String::from("/tmp/paper.pdf"),
            output_dir: String::from("/tmp/out/paper"),
            markdown_path: String::from("/tmp/out/paper/index.md"),
            downloaded_figures: 2,
            remote_figure_links: 3,
            image_blocks: 3,
            usage: None,
            log_path: String::from("/tmp/out/paper/log.jsonl"),
        };
        print_single_summary_stdout(&summary);
        print_batch_summary_stdout(2, 1, 4);
    }

    /// Without a `MultiProgress` there is nothing to draw on, so no callback is produced.
    #[test]
    fn progress_callback_returns_none_without_multi_progress() {
        let callback = progress_callback(Path::new("paper.pdf"), None);
        assert!(callback.is_none());
    }

    /// Smoke test: the callback accepts a full event sequence on a hidden draw target.
    #[test]
    fn progress_callback_handles_all_events() {
        let hidden = ProgressDrawTarget::hidden();
        let multi = Arc::new(MultiProgress::with_draw_target(hidden));
        let callback = progress_callback(Path::new("paper.pdf"), Some(multi)).unwrap();

        let events = [
            ProgressEvent::OcrStarted,
            ProgressEvent::OcrFinished,
            ProgressEvent::MarkdownWriteStarted { bytes: 16 },
            ProgressEvent::MarkdownWriteFinished,
            ProgressEvent::FigureScanStarted { total: 2 },
            ProgressEvent::FigureDownloadFinished,
            ProgressEvent::FigureDownloadFinished,
        ];
        for event in events {
            callback(event);
        }
    }
}