Skip to main content

edgeparse_core/pdf/
loader.rs

1//! PDF document loading via lopdf.
2
3use std::path::Path;
4
5use lopdf::Document;
6
7use crate::EdgePdfError;
8
9/// Raw loaded PDF document with page data.
10pub struct RawPdfDocument {
11    /// The lopdf Document handle
12    pub document: Document,
13    /// Number of pages
14    pub num_pages: u32,
15    /// Document metadata
16    pub metadata: PdfMetadata,
17}
18
19impl std::fmt::Debug for RawPdfDocument {
20    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
21        f.debug_struct("RawPdfDocument")
22            .field("num_pages", &self.num_pages)
23            .field("metadata", &self.metadata)
24            .finish_non_exhaustive()
25    }
26}
27
/// Descriptive fields read from a PDF's document information dictionary.
///
/// Every field is optional: a value is `None` when the corresponding entry is
/// missing or could not be turned into a `String`. `Default` yields a value
/// with all fields set to `None`.
#[derive(Debug, Clone, Default)]
pub struct PdfMetadata {
    /// Value of the `Author` entry.
    pub author: Option<String>,
    /// Value of the `Title` entry.
    pub title: Option<String>,
    /// Value of the `CreationDate` entry, kept as the raw string (not parsed).
    pub creation_date: Option<String>,
    /// Value of the `ModDate` entry, kept as the raw string (not parsed).
    pub modification_date: Option<String>,
}
40
41/// Load a PDF file and extract basic structure.
42///
43/// # Arguments
44/// * `path` - Path to the PDF file
45/// * `password` - Optional decryption password
46///
47/// # Errors
48/// Returns `EdgePdfError::LoadError` if the file cannot be read or parsed.
49pub fn load_pdf(path: &Path, _password: Option<&str>) -> Result<RawPdfDocument, EdgePdfError> {
50    if !path.exists() {
51        return Err(EdgePdfError::LoadError(format!(
52            "File not found: {}",
53            path.display()
54        )));
55    }
56
57    let document = Document::load(path).map_err(|e| {
58        EdgePdfError::LoadError(format!("Failed to load PDF {}: {}", path.display(), e))
59    })?;
60
61    let pages = document.get_pages();
62    let num_pages = pages.len() as u32;
63
64    let metadata = extract_metadata(&document);
65
66    Ok(RawPdfDocument {
67        document,
68        num_pages,
69        metadata,
70    })
71}
72
73/// Extract metadata from the PDF document info dictionary.
74fn extract_metadata(doc: &Document) -> PdfMetadata {
75    let mut metadata = PdfMetadata::default();
76
77    if let Ok(info_ref) = doc.trailer.get(b"Info") {
78        if let Ok(info_ref) = info_ref.as_reference() {
79            if let Ok(info) = doc.get_object(info_ref) {
80                if let Ok(dict) = info.as_dict() {
81                    metadata.author = extract_string_field(dict, b"Author");
82                    metadata.title = extract_string_field(dict, b"Title");
83                    metadata.creation_date = extract_string_field(dict, b"CreationDate");
84                    metadata.modification_date = extract_string_field(dict, b"ModDate");
85                }
86            }
87        }
88    }
89
90    metadata
91}
92
93/// Load a PDF from an in-memory byte slice.
94///
95/// Uses lopdf's `Document::load_mem()` which parses PDF from `&[u8]`.
96/// This is the WASM-compatible loader — no filesystem access required.
97///
98/// # Arguments
99/// * `data` — raw PDF bytes
100/// * `_password` — optional decryption password (not yet implemented)
101///
102/// # Errors
103/// Returns `EdgePdfError::LoadError` if the bytes cannot be parsed as PDF.
104pub fn load_pdf_from_bytes(
105    data: &[u8],
106    _password: Option<&str>,
107) -> Result<RawPdfDocument, EdgePdfError> {
108    if data.is_empty() {
109        return Err(EdgePdfError::LoadError("Empty PDF data".to_string()));
110    }
111
112    let document = Document::load_mem(data)
113        .map_err(|e| EdgePdfError::LoadError(format!("Failed to parse PDF from bytes: {e}")))?;
114
115    let pages = document.get_pages();
116    let num_pages = pages.len() as u32;
117    let metadata = extract_metadata(&document);
118
119    Ok(RawPdfDocument {
120        document,
121        num_pages,
122        metadata,
123    })
124}
125
126/// Extract a string field from a PDF dictionary.
127fn extract_string_field(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
128    dict.get(key).ok().and_then(|obj| match obj {
129        lopdf::Object::String(bytes, _) => String::from_utf8(bytes.clone()).ok(),
130        _ => None,
131    })
132}
133
#[cfg(test)]
mod tests {
    use super::*;

    /// Loading a path that does not exist must fail with a `LoadError`
    /// whose message reports the missing file.
    #[test]
    fn test_load_nonexistent_file() {
        let missing = Path::new("/nonexistent/file.pdf");
        let failure = load_pdf(missing, None).expect_err("loading a missing file must fail");

        match failure {
            EdgePdfError::LoadError(msg) => assert!(msg.contains("File not found")),
            other => panic!("Unexpected error: {:?}", other),
        }
    }
}
147}