// edgequake-pdf2md 0.6.0
//
// Convert PDF documents to Markdown using Vision Language Models — CLI and library
//! Output types returned from successful conversions.
//!
//! These types are serialisable so downstream tools can consume JSON output
//! (e.g. `pdf2md --json`) without parsing Markdown. Every field that might
//! be missing (e.g. optional PDF metadata) uses `Option` rather than empty
//! strings so callers can distinguish "not present" from "present but blank".

use crate::error::{PageError, Pdf2MdError};
use serde::{Deserialize, Serialize};

/// The complete result of converting a PDF document to Markdown.
///
/// ## Why pages + markdown?
/// Both representations are returned together because callers often need:
/// - The assembled `markdown` string for writing files or rendering
/// - The individual `pages` list for cost accounting, error reporting, or
///   streaming UIs that display pages as they arrive
///
/// Callers that only need the text can discard `pages`, `metadata`, and
/// `stats`.
///
/// The type derives `Serialize`/`Deserialize`, so it can be emitted as JSON
/// (e.g. `pdf2md --json`) and read back by downstream tools.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConversionOutput {
    /// The full, assembled Markdown document.
    ///
    /// Pages are joined with the configured [`crate::config::PageSeparator`]
    /// and optionally prefixed with YAML front-matter. Post-processing rules
    /// have already been applied to each page before assembly.
    pub markdown: String,

    /// Per-page results, sorted by [`PageResult::page_num`].
    ///
    /// Includes both successful and failed pages so callers can report
    /// which pages need manual review after a partial conversion.
    pub pages: Vec<PageResult>,

    /// PDF metadata (title, author, page count, version, encryption status).
    ///
    /// Extracted without rendering via pdfium before conversion starts,
    /// making it available even if rendering or LLM calls later fail.
    pub metadata: DocumentMetadata,

    /// Aggregate timing, token usage, and page-count statistics.
    ///
    /// Useful for cost estimation, benchmark comparisons, and progress bars.
    /// [`ConversionOutput::failed_pages`] and [`ConversionOutput::into_result`]
    /// read their counts from this field.
    pub stats: ConversionStats,
}

impl ConversionOutput {
    /// Returns how many pages failed during conversion.
    ///
    /// This is a shorthand for reading `stats.failed_pages`. A non-zero value
    /// means the conversion was only partially successful, so callers can use
    /// it to warn users about pages that need manual review.
    pub fn failed_pages(&self) -> usize {
        self.stats.failed_pages
    }

    /// Turns this output into a `Result` that is `Err` as soon as a single
    /// page failed.
    ///
    /// Use this when *any* page failure should be treated as an error instead
    /// of silently accepting partial output.
    ///
    /// # Errors
    /// Returns [`Pdf2MdError::PartialFailure`] when `stats.failed_pages` is
    /// greater than zero. The error carries `stats.processed_pages` as
    /// `success`, the failed count as `failed`, and `stats.total_pages` as
    /// `total`. The output itself is consumed and not recoverable from the
    /// error.
    ///
    /// # Example
    /// ```rust,no_run
    /// # use edgequake_pdf2md::{convert, ConversionConfig};
    /// # #[tokio::main]
    /// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// let config = ConversionConfig::default();
    /// let output = convert("doc.pdf", &config).await?;
    /// // Error if any page failed:
    /// let output = output.into_result()?;
    /// println!("{}", output.markdown);
    /// # Ok(())
    /// # }
    /// ```
    pub fn into_result(self) -> Result<Self, Pdf2MdError> {
        let failed = self.stats.failed_pages;

        // Happy path first: nothing failed, hand the output back untouched.
        if failed == 0 {
            return Ok(self);
        }

        // At least one page failed; summarise the counts in the error.
        let success = self.stats.processed_pages;
        let total = self.stats.total_pages;
        Err(Pdf2MdError::PartialFailure {
            success,
            failed,
            total,
        })
    }
}

/// The result of converting a single page image through the VLM.
///
/// A page can succeed (`error` is `None`) or fail (`error` is `Some`).
/// Failed pages emit a non-fatal [`PageError`] and carry an empty `markdown`
/// string — the overall conversion continues unless every page fails.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageResult {
    /// 1-indexed page number matching PDF reader conventions.
    pub page_num: usize,

    /// Post-processed Markdown for this page, or `""` on failure.
    pub markdown: String,

    /// Tokens consumed from the prompt (image + system prompt).
    ///
    /// Image tokens dominate: a 1,024 × 1,024 px PNG ≈ 1,000–1,500 tokens
    /// depending on the model's vision tokeniser.
    pub input_tokens: usize,

    /// Tokens generated by the model (i.e. the Markdown text).
    pub output_tokens: usize,

    /// Wall-clock time for this page's full round-trip (render → encode → LLM),
    /// in milliseconds. Reflects actual latency including retries.
    pub duration_ms: u64,

    /// How many retry attempts were needed before success (0 = first try).
    ///
    /// Persistent retries > 0 often indicate rate-limiting or transient
    /// API errors; worth monitoring in production.
    pub retries: u8,

    /// `Some` when this page failed after all retries, `None` on success.
    ///
    /// A failed page does not abort the run: the surrounding
    /// `ConversionOutput` still contains results for the other pages, and the
    /// conversion only returns `Err` at the top level when **all** pages fail.
    pub error: Option<PageError>,
}

/// Aggregate statistics for the whole conversion run.
///
/// Used for cost estimation: multiply token counts by the model's per-token
/// price to get total spend. Timing breakdown (render vs. LLM) helps identify
/// whether bottlenecks are in rasterisation or API latency.
///
/// The derived `Default` sets every counter and duration to zero.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ConversionStats {
    /// Total pages in the PDF (includes pages not selected for conversion).
    pub total_pages: usize,

    /// Pages that produced Markdown output successfully.
    pub processed_pages: usize,

    /// Pages that exhausted all retries and produced no output.
    pub failed_pages: usize,

    /// Pages in the selection that were skipped before any processing (e.g.
    /// page encoding failed before the LLM call was even attempted).
    pub skipped_pages: usize,

    /// Sum of all [`PageResult::input_tokens`] across processed pages.
    ///
    /// Stored as `u64` although the per-page counts are `usize`.
    pub total_input_tokens: u64,

    /// Sum of all [`PageResult::output_tokens`] across processed pages.
    ///
    /// Stored as `u64` although the per-page counts are `usize`.
    pub total_output_tokens: u64,

    /// End-to-end wall time from first byte read to last page assembled (ms).
    pub total_duration_ms: u64,

    /// Time spent in pdfium rasterisation only (ms).
    ///
    /// If this dominates, lower DPI or reduce `max_rendered_pixels`.
    pub render_duration_ms: u64,

    /// Time spent waiting for LLM API calls only (ms).
    ///
    /// If this dominates, increase concurrency or switch to a faster model.
    pub llm_duration_ms: u64,
}

/// Document metadata read from the PDF's info dictionary.
///
/// All info-dictionary string fields (`title` through `modification_date`)
/// are `Option` because PDF metadata is entirely optional — many PDFs omit
/// title, author, etc. The `page_count` and `pdf_version` fields are always
/// present after a successful open.
///
/// Note that the derived `Default` yields `None` for every optional field,
/// `page_count == 0`, an empty `pdf_version`, and `is_encrypted == false`;
/// such a value is a placeholder, not the metadata of a real document.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// Title from the PDF info dictionary (`/Title`).
    pub title: Option<String>,
    /// Author from the PDF info dictionary (`/Author`).
    pub author: Option<String>,
    /// Subject from the PDF info dictionary (`/Subject`).
    pub subject: Option<String>,
    /// Application that created the source document (`/Creator`).
    pub creator: Option<String>,
    /// PDF generator software (`/Producer`).
    pub producer: Option<String>,
    /// Creation date string as stored in the PDF (`/CreationDate`).
    ///
    /// Kept as the raw string; it is not parsed into a date type here.
    pub creation_date: Option<String>,
    /// Last-modification date string as stored in the PDF (`/ModDate`).
    ///
    /// Kept as the raw string; it is not parsed into a date type here.
    pub modification_date: Option<String>,
    /// Total number of pages in the document.
    pub page_count: usize,
    /// PDF specification version (e.g. `"1.7"`, `"2.0"`).
    pub pdf_version: String,
    /// Whether the document requires a password to open.
    pub is_encrypted: bool,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a minimal `ConversionOutput` whose stats carry the given page
    /// counts; everything else is left at a fixed or default value.
    fn make_output(failed: usize, processed: usize, total: usize) -> ConversionOutput {
        let stats = ConversionStats {
            total_pages: total,
            processed_pages: processed,
            failed_pages: failed,
            ..Default::default()
        };

        ConversionOutput {
            markdown: String::from("# Hello"),
            pages: Vec::new(),
            metadata: DocumentMetadata::default(),
            stats,
        }
    }

    /// The convenience accessor must mirror `stats.failed_pages`.
    #[test]
    fn failed_pages_matches_stats() {
        let failed = make_output(2, 8, 10).failed_pages();
        assert_eq!(failed, 2);
    }

    /// With zero failed pages the output is passed through as `Ok`.
    #[test]
    fn into_result_ok_when_no_failures() {
        let result = make_output(0, 5, 5).into_result();
        assert!(result.is_ok());
    }

    /// A single failed page turns the output into `PartialFailure` carrying
    /// the processed / failed / total counts from the stats.
    #[test]
    fn into_result_err_on_partial_failure() {
        let err = make_output(1, 9, 10).into_result().unwrap_err();

        let counts = match err {
            Pdf2MdError::PartialFailure {
                success,
                failed,
                total,
            } => (success, failed, total),
            other => panic!("expected PartialFailure, got {other:?}"),
        };

        assert_eq!(counts, (9, 1, 10));
    }
}