// fop-pdf-renderer 0.1.2
//
// Pure Rust PDF-to-image renderer for fop
// Documentation
//! fop-pdf-renderer — Pure Rust PDF-to-image renderer
//!
//! Renders PDF pages to raster images (PNG/RGBA) without any C dependencies.
//! Designed to work with PDFs generated by fop-render, enabling self-contained
//! testing and verification.
//!
//! # Quick Start
//!
//! ```no_run
//! use fop_pdf_renderer::PdfRenderer;
//!
//! let pdf_data = std::fs::read("output.pdf").unwrap();
//! let renderer = PdfRenderer::from_bytes(&pdf_data).unwrap();
//!
//! println!("Pages: {}", renderer.page_count());
//!
//! // Render page 0 at 150 DPI
//! let image = renderer.render_page(0, 150.0).unwrap();
//!
//! // Save as PNG
//! renderer.save_as_png(0, "output.png", 150.0).unwrap();
//! ```

pub mod content;
pub mod error;
pub mod font;
pub mod glyph;
pub mod graphics;
pub mod image;
pub mod parser;
pub mod rasterizer;
pub mod text;
pub mod text_extract;

pub use error::{PdfRenderError, Result};
pub use rasterizer::RasterPage;

/// High-level PDF renderer
///
/// Parses a PDF document and provides page rendering capabilities.
///
/// Construct one with [`PdfRenderer::from_bytes`]. The value owns the parsed
/// document, and every method on it borrows the renderer immutably
/// (`&self`). Page indices are zero-based and range over
/// `0..page_count()`.
pub struct PdfRenderer {
    /// Parsed document that all rendering and extraction calls read from.
    doc: parser::PdfDocument,
}

impl PdfRenderer {
    /// Build a renderer by parsing an in-memory PDF file.
    ///
    /// # Errors
    ///
    /// Propagates whatever error `parser::PdfDocument::from_bytes` reports
    /// for `data`.
    pub fn from_bytes(data: &[u8]) -> Result<Self> {
        Ok(Self {
            doc: parser::PdfDocument::from_bytes(data)?,
        })
    }

    /// Page count as reported by the parsed document.
    pub fn page_count(&self) -> usize {
        self.doc.page_count()
    }

    /// Rasterize one page (zero-based `page_index`) at the requested `dpi`.
    ///
    /// The result is a `RasterPage` holding RGBA pixels.
    ///
    /// # Errors
    ///
    /// Fails if the page cannot be fetched from the document or if the
    /// rasterizer reports an error.
    pub fn render_page(&self, page_index: usize, dpi: f32) -> Result<RasterPage> {
        // Look the page up first so a bad index fails before any rasterizer
        // state is created.
        let target = self.doc.get_page(page_index)?;
        let mut raster = rasterizer::PageRasterizer::new(&self.doc);
        raster.render(&target, dpi)
    }

    /// Rasterize one page and write it to `path` as a PNG file.
    ///
    /// # Errors
    ///
    /// Fails if rendering fails or if saving the PNG fails.
    pub fn save_as_png(&self, page_index: usize, path: &str, dpi: f32) -> Result<()> {
        self.render_page(page_index, dpi)?.save_png(path)
    }

    /// Rasterize every page in order and return one PNG-encoded buffer per page.
    ///
    /// # Errors
    ///
    /// Stops at the first page that fails to render or encode and returns
    /// that error; no partial output is returned.
    pub fn render_all_pages(&self, dpi: f32) -> Result<Vec<Vec<u8>>> {
        (0..self.page_count())
            .map(|index| -> Result<Vec<u8>> {
                let raster = self.render_page(index, dpi)?;
                let png = raster.to_png()?;
                Ok(png)
            })
            .collect()
    }

    /// Pull the user-visible text out of one page (zero-based `page_index`).
    ///
    /// Extraction is best-effort: PDFs with obfuscated encodings or missing
    /// ToUnicode CMaps may yield '?' placeholders for unmapped glyphs.
    ///
    /// # Errors
    ///
    /// Fails if the page cannot be fetched or the extractor reports an error.
    pub fn extract_text(&self, page_index: usize) -> Result<String> {
        let target = self.doc.get_page(page_index)?;
        let mut reader = text_extract::TextExtractor::new(&self.doc);
        reader.extract_page(&target)
    }

    /// Pull the user-visible text out of every page, separating consecutive
    /// pages with `"\n\n"`.
    ///
    /// A document with no pages yields an empty string.
    ///
    /// # Errors
    ///
    /// Returns the error of the first page whose extraction fails.
    pub fn extract_all_text(&self) -> Result<String> {
        let per_page = (0..self.page_count())
            .map(|index| self.extract_text(index))
            .collect::<Result<Vec<String>>>()?;
        Ok(per_page.join("\n\n"))
    }

    /// Fetch the XMP metadata packet stored in the catalog `/Metadata` stream.
    ///
    /// The packet is returned verbatim as a UTF-8 string (including any
    /// `<?xpacket ...?>` processing instructions). `None` is returned when
    /// the document has no embedded XMP stream or when the stream bytes are
    /// not valid UTF-8.
    pub fn extract_xmp_metadata(&self) -> Option<String> {
        self.doc
            .get_metadata_stream()
            .and_then(|raw| String::from_utf8(raw).ok())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Input that is not a PDF at all must be rejected with an error.
    #[test]
    fn test_empty_pdf_returns_error() {
        assert!(PdfRenderer::from_bytes(b"not a pdf").is_err());
    }

    /// Parsing a tiny hand-written document must never panic; when parsing
    /// succeeds the document must report exactly one page.
    #[test]
    fn test_minimal_pdf() {
        // Catalog -> page tree -> a single page without contents.
        let pdf = b"%PDF-1.4\n\
            1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n\
            2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n\
            3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\n\
            xref\n0 4\n0000000000 65535 f \n0000000009 00000 n \n0000000058 00000 n \n0000000115 00000 n \n\
            trailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n200\n%%EOF\n";

        // The hand-written cross-reference data may not be exact, so a parse
        // error is tolerated here; only a successful parse is inspected.
        if let Ok(parsed) = PdfRenderer::from_bytes(pdf) {
            assert_eq!(parsed.page_count(), 1);
        }
    }

    /// Build a minimal hand-crafted single-page PDF whose content stream
    /// shows `text` with one `Tj` operator, using a standard Type1
    /// `/Helvetica` font registered as `/F1`.  Used to verify `extract_text`.
    ///
    /// Backslashes and parentheses in `text` are escaped so that any input
    /// forms a well-formed PDF literal string; text without those characters
    /// is emitted verbatim.  No other encoding is applied, so callers should
    /// stick to ASCII text.
    ///
    /// Objects are written in the order 1 (catalog), 2 (page tree),
    /// 4 (content stream), 3 (page); the cross-reference table records the
    /// byte offset of each one, listed by object number.
    fn build_text_pdf(text: &str) -> Vec<u8> {
        // `\`, `(` and `)` delimit or escape inside a PDF literal string;
        // left raw they would terminate or corrupt the `( ... ) Tj` operand.
        let mut escaped = String::with_capacity(text.len());
        for ch in text.chars() {
            if matches!(ch, '\\' | '(' | ')') {
                escaped.push('\\');
            }
            escaped.push(ch);
        }

        let stream_content = format!("BT /F1 12 Tf 72 700 Td ({}) Tj ET", escaped);
        let content = stream_content.as_bytes();
        let mut out: Vec<u8> = Vec::new();
        out.extend_from_slice(b"%PDF-1.4\n");

        // Obj 1: Catalog
        let o1 = out.len();
        out.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

        // Obj 2: Pages
        let o2 = out.len();
        out.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n");

        // Obj 4: Content stream (`/Length` counts only the bytes between
        // `stream\n` and the newline preceding `endstream`).
        let o4 = out.len();
        out.extend_from_slice(
            format!("4 0 obj\n<< /Length {} >>\nstream\n", content.len()).as_bytes(),
        );
        out.extend_from_slice(content);
        out.extend_from_slice(b"\nendstream\nendobj\n");

        // Obj 3: Page with Helvetica font resource
        let o3 = out.len();
        out.extend_from_slice(
            b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R \
              /Resources << /Font << /F1 << /Type /Font /Subtype /Type1 /BaseFont /Helvetica >> >> >> >>\nendobj\n",
        );

        // Cross-reference table: the free entry for object 0, then one
        // 20-byte entry per object in object-number order (not file order).
        let xref_pos = out.len();
        out.extend_from_slice(b"xref\n0 5\n");
        out.extend_from_slice(b"0000000000 65535 f \n");
        for offset in [o1, o2, o3, o4] {
            out.extend_from_slice(format!("{:010} 00000 n \n", offset).as_bytes());
        }
        out.extend_from_slice(b"trailer\n<< /Size 5 /Root 1 0 R >>\n");
        out.extend_from_slice(b"startxref\n");
        out.extend_from_slice(format!("{}\n", xref_pos).as_bytes());
        out.extend_from_slice(b"%%EOF\n");
        out
    }

    /// Text placed in a generated PDF must come back out of `extract_text`.
    #[test]
    fn test_extract_text_round_trips_through_fop() {
        let document = build_text_pdf("Hello World");
        let pdf = PdfRenderer::from_bytes(&document).expect("minimal text PDF should parse");
        assert_eq!(pdf.page_count(), 1);

        let extracted = pdf.extract_text(0).expect("extract_text should succeed");

        // Each word is checked separately, so the test does not depend on
        // how the extractor renders the space between them.
        let has_hello = extracted.contains("Hello");
        assert!(
            has_hello,
            "extracted text should contain 'Hello', got {:?}",
            extracted
        );
        let has_world = extracted.contains("World");
        assert!(
            has_world,
            "extracted text should contain 'World', got {:?}",
            extracted
        );
    }

    /// `extract_all_text` on a one-page document must include that page's text.
    #[test]
    fn test_extract_all_text_single_page() {
        let document = build_text_pdf("FooBar");
        let pdf = PdfRenderer::from_bytes(&document).expect("minimal text PDF should parse");

        let combined = pdf
            .extract_all_text()
            .expect("extract_all_text should succeed");

        let found = combined.contains("FooBar");
        assert!(
            found,
            "extract_all_text should contain 'FooBar', got {:?}",
            combined
        );
    }

    /// Asking for a page index far beyond the last page must yield an error.
    #[test]
    fn test_extract_text_out_of_bounds_returns_error() {
        let document = build_text_pdf("x");
        let pdf = PdfRenderer::from_bytes(&document).expect("minimal text PDF should parse");

        let outcome = pdf.extract_text(999);
        assert!(
            outcome.is_err(),
            "out-of-bounds page index should return error"
        );
    }
}