rusty-pdfgrep 0.1.0

Grep through PDF files — a Rust port of Hans-Peter Deifel's `pdfgrep(1)` with lopdf-backed text extraction, regex + fancy-regex pluggable engines, --password retry for encrypted PDFs, GNU-grep-compatible color output, recursive walking with fnmatch include/exclude, and a typed library API.
Documentation
//! PDF reading via `lopdf` (FR-024..FR-026, AD-004, AD-012, HINT-002/003).
//!
//! Per-page text extraction wraps each call in `std::panic::catch_unwind` —
//! `lopdf` has known panic paths on malformed CID font dictionaries. A panic
//! on page N is turned into an `Err`, so the caller can warn on stderr and
//! advance to N+1.

use std::panic::AssertUnwindSafe;
use std::path::Path;

use crate::error::PdfGrepError;

/// Loaded PDF handle ready for page-by-page extraction.
///
/// Built by [`PdfDocument::open`] (from a path) or
/// [`PdfDocument::from_bytes`] (from an in-memory buffer). Both constructors
/// fail rather than return a handle for an encrypted PDF that could not be
/// unlocked.
#[derive(Debug)]
pub struct PdfDocument {
    /// Parsed `lopdf` document that page text is extracted from.
    doc: lopdf::Document,
    /// Page numbers present in `doc`, sorted ascending at construction time.
    page_numbers: Vec<u32>,
}

impl PdfDocument {
    /// Load a PDF from a path, attempting each `passwords` entry in order if
    /// the file is encrypted. Returns `PdfGrepError::Encrypted` when no
    /// password succeeds.
    ///
    /// The empty password is always tried before the entries of `passwords`.
    ///
    /// # Errors
    ///
    /// - `PdfGrepError::Io` on file-open failure.
    /// - `PdfGrepError::Pdf` on `lopdf` parser failure.
    /// - `PdfGrepError::Encrypted` on encrypted PDF with no successful password.
    pub fn open(path: &Path, passwords: &[String]) -> Result<Self, PdfGrepError> {
        let doc = lopdf::Document::load(path).map_err(|e| match e {
            // Keep the underlying `io::Error` so callers can inspect its kind.
            lopdf::Error::IO(io) => PdfGrepError::Io {
                path: path.to_path_buf(),
                source: io,
            },
            // Every other `lopdf` failure is reported as a parser error.
            other => PdfGrepError::Pdf {
                path: path.to_path_buf(),
                message: other.to_string(),
            },
        })?;

        Self::finish(doc, path, passwords)
    }

    /// Load a PDF from bytes (for stdin handling).
    ///
    /// Errors are reported against the virtual path `<stdin>`. Encrypted
    /// input is unlocked the same way as in [`PdfDocument::open`]: the empty
    /// password first, then each entry of `passwords` in order.
    ///
    /// # Errors
    ///
    /// - `PdfGrepError::Pdf` on parser failure.
    /// - `PdfGrepError::Encrypted` if encrypted and no password works.
    pub fn from_bytes(bytes: &[u8], passwords: &[String]) -> Result<Self, PdfGrepError> {
        let virtual_path = Path::new("<stdin>");
        let doc = lopdf::Document::load_mem(bytes).map_err(|e| PdfGrepError::Pdf {
            path: virtual_path.to_path_buf(),
            message: e.to_string(),
        })?;

        Self::finish(doc, virtual_path, passwords)
    }

    /// Sorted list of 1-indexed page numbers in this PDF.
    #[must_use]
    pub fn page_numbers(&self) -> &[u32] {
        &self.page_numbers
    }

    /// Extract text from a single page, catching `lopdf` parser panics.
    /// Returns `Ok(text)` on success, `Err(message)` on parser error or panic
    /// — the caller decides whether to warn or propagate.
    ///
    /// # Errors
    ///
    /// Returns a message prefixed with `page {page_num}: ` when `lopdf`
    /// reports an extraction error or panics while processing the page.
    pub fn extract_page(&self, page_num: u32) -> Result<String, String> {
        let doc = &self.doc;
        // NOTE(review): `AssertUnwindSafe` assumes the document is still
        // usable for other pages after `extract_text` panics — confirm
        // against `lopdf` internals.
        let outcome =
            std::panic::catch_unwind(AssertUnwindSafe(|| doc.extract_text(&[page_num])));
        match outcome {
            Ok(Ok(text)) => Ok(text),
            Ok(Err(e)) => Err(format!("page {page_num}: {e}")),
            Err(_) => Err(format!("page {page_num}: parser panic")),
        }
    }

    /// Shared tail of the constructors: unlock `doc` if it is encrypted, then
    /// collect its page numbers in ascending order.
    ///
    /// `path` is only used to label a `PdfGrepError::Encrypted` error.
    fn finish(
        mut doc: lopdf::Document,
        path: &Path,
        passwords: &[String],
    ) -> Result<Self, PdfGrepError> {
        if doc.is_encrypted() && !Self::authenticate(&mut doc, passwords) {
            return Err(PdfGrepError::Encrypted {
                path: path.to_path_buf(),
            });
        }

        let mut page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
        page_numbers.sort_unstable();

        Ok(Self { doc, page_numbers })
    }

    /// Try to decrypt `doc`, returning `true` as soon as one password works.
    ///
    /// The empty password is always tried first (matches upstream's silent
    /// auto-decrypt of empty-string-encrypted PDFs), followed by `passwords`
    /// in the order given. `any` short-circuits, so no further attempts are
    /// made after the first success.
    fn authenticate(doc: &mut lopdf::Document, passwords: &[String]) -> bool {
        std::iter::once("")
            .chain(passwords.iter().map(String::as_str))
            .any(|candidate| doc.decrypt(candidate).is_ok())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Opening a path that does not exist must fail. `open` maps
    /// `lopdf::Error::IO` to `Io` and every other loader error to `Pdf`, so
    /// either variant is accepted here.
    #[test]
    fn non_existent_path_returns_io_error() {
        let missing = Path::new("/definitely/not/a/real/pdf.pdf");
        let outcome = PdfDocument::open(missing, &[]);
        assert!(matches!(
            outcome,
            Err(PdfGrepError::Io { .. } | PdfGrepError::Pdf { .. })
        ));
    }

    /// Bytes that are not a PDF must be rejected with a `Pdf` error.
    #[test]
    fn invalid_bytes_return_pdf_error() {
        let garbage: &[u8] = b"NOT A PDF";
        let outcome = PdfDocument::from_bytes(garbage, &[]);
        assert!(matches!(outcome, Err(PdfGrepError::Pdf { .. })));
    }
}