Skip to main content

edgeparse_core/pdf/
loader.rs

1//! PDF document loading via lopdf.
2
3use std::path::Path;
4
5use lopdf::Document;
6
7use crate::EdgePdfError;
8
9/// Raw loaded PDF document with page data.
10pub struct RawPdfDocument {
11    /// The lopdf Document handle
12    pub document: Document,
13    /// Number of pages
14    pub num_pages: u32,
15    /// Document metadata
16    pub metadata: PdfMetadata,
17}
18
19impl std::fmt::Debug for RawPdfDocument {
20    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
21        f.debug_struct("RawPdfDocument")
22            .field("num_pages", &self.num_pages)
23            .field("metadata", &self.metadata)
24            .finish_non_exhaustive()
25    }
26}
27
/// Extracted PDF metadata.
///
/// Every field is optional and defaults to `None`. The date fields hold plain
/// strings; this type does not parse them into a structured date.
///
/// `PartialEq`/`Eq` are derived so metadata values can be compared directly,
/// for example in tests or when checking whether two loads agree.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PdfMetadata {
    /// Document author, if present.
    pub author: Option<String>,
    /// Document title, if present.
    pub title: Option<String>,
    /// Creation date string, if present.
    pub creation_date: Option<String>,
    /// Modification date string, if present.
    pub modification_date: Option<String>,
}
40
41/// Load a PDF file and extract basic structure.
42///
43/// # Arguments
44/// * `path` - Path to the PDF file
45/// * `password` - Optional decryption password
46///
47/// # Errors
48/// Returns `EdgePdfError::LoadError` if the file cannot be read or parsed.
49pub fn load_pdf(path: &Path, _password: Option<&str>) -> Result<RawPdfDocument, EdgePdfError> {
50    if !path.exists() {
51        return Err(EdgePdfError::LoadError(format!(
52            "File not found: {}",
53            path.display()
54        )));
55    }
56
57    let document = Document::load(path).map_err(|e| {
58        EdgePdfError::LoadError(format!("Failed to load PDF {}: {}", path.display(), e))
59    })?;
60
61    let pages = document.get_pages();
62    let num_pages = pages.len() as u32;
63
64    let metadata = extract_metadata(&document);
65
66    Ok(RawPdfDocument {
67        document,
68        num_pages,
69        metadata,
70    })
71}
72
73/// Extract metadata from the PDF document info dictionary.
74fn extract_metadata(doc: &Document) -> PdfMetadata {
75    let mut metadata = PdfMetadata::default();
76
77    if let Ok(info_ref) = doc.trailer.get(b"Info") {
78        if let Ok(info_ref) = info_ref.as_reference() {
79            if let Ok(info) = doc.get_object(info_ref) {
80                if let Ok(dict) = info.as_dict() {
81                    metadata.author = extract_string_field(dict, b"Author");
82                    metadata.title = extract_string_field(dict, b"Title");
83                    metadata.creation_date = extract_string_field(dict, b"CreationDate");
84                    metadata.modification_date = extract_string_field(dict, b"ModDate");
85                }
86            }
87        }
88    }
89
90    metadata
91}
92
93/// Extract a string field from a PDF dictionary.
94fn extract_string_field(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
95    dict.get(key).ok().and_then(|obj| match obj {
96        lopdf::Object::String(bytes, _) => String::from_utf8(bytes.clone()).ok(),
97        _ => None,
98    })
99}
100
#[cfg(test)]
mod tests {
    use super::*;

    /// A path that does not exist must produce a `LoadError` whose message
    /// says the file was not found.
    #[test]
    fn test_load_nonexistent_file() {
        let missing = Path::new("/nonexistent/file.pdf");
        let outcome = load_pdf(missing, None);
        assert!(outcome.is_err());

        match outcome.unwrap_err() {
            EdgePdfError::LoadError(message) => assert!(message.contains("File not found")),
            unexpected => panic!("Unexpected error: {:?}", unexpected),
        }
    }
}