edgeparse_core/pdf/
loader.rs1use std::path::Path;
4
5use lopdf::Document;
6
7use crate::EdgePdfError;
8
9pub struct RawPdfDocument {
11 pub document: Document,
13 pub num_pages: u32,
15 pub metadata: PdfMetadata,
17}
18
19impl std::fmt::Debug for RawPdfDocument {
20 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
21 f.debug_struct("RawPdfDocument")
22 .field("num_pages", &self.num_pages)
23 .field("metadata", &self.metadata)
24 .finish_non_exhaustive()
25 }
26}
27
/// Descriptive metadata for a PDF document.
///
/// Every field is optional; `PdfMetadata::default()` leaves all of them `None`.
#[derive(Clone, Debug, Default)]
pub struct PdfMetadata {
    /// Document author, if recorded.
    pub author: Option<String>,
    /// Document title, if recorded.
    pub title: Option<String>,
    /// Creation date, kept as the raw string found in the document.
    pub creation_date: Option<String>,
    /// Last-modification date, kept as the raw string found in the document.
    pub modification_date: Option<String>,
}
40
41pub fn load_pdf(path: &Path, _password: Option<&str>) -> Result<RawPdfDocument, EdgePdfError> {
50 if !path.exists() {
51 return Err(EdgePdfError::LoadError(format!(
52 "File not found: {}",
53 path.display()
54 )));
55 }
56
57 let document = Document::load(path).map_err(|e| {
58 EdgePdfError::LoadError(format!("Failed to load PDF {}: {}", path.display(), e))
59 })?;
60
61 let pages = document.get_pages();
62 let num_pages = pages.len() as u32;
63
64 let metadata = extract_metadata(&document);
65
66 Ok(RawPdfDocument {
67 document,
68 num_pages,
69 metadata,
70 })
71}
72
73fn extract_metadata(doc: &Document) -> PdfMetadata {
75 let mut metadata = PdfMetadata::default();
76
77 if let Ok(info_ref) = doc.trailer.get(b"Info") {
78 if let Ok(info_ref) = info_ref.as_reference() {
79 if let Ok(info) = doc.get_object(info_ref) {
80 if let Ok(dict) = info.as_dict() {
81 metadata.author = extract_string_field(dict, b"Author");
82 metadata.title = extract_string_field(dict, b"Title");
83 metadata.creation_date = extract_string_field(dict, b"CreationDate");
84 metadata.modification_date = extract_string_field(dict, b"ModDate");
85 }
86 }
87 }
88 }
89
90 metadata
91}
92
93pub fn load_pdf_from_bytes(
105 data: &[u8],
106 _password: Option<&str>,
107) -> Result<RawPdfDocument, EdgePdfError> {
108 if data.is_empty() {
109 return Err(EdgePdfError::LoadError("Empty PDF data".to_string()));
110 }
111
112 let document = Document::load_mem(data)
113 .map_err(|e| EdgePdfError::LoadError(format!("Failed to parse PDF from bytes: {e}")))?;
114
115 let pages = document.get_pages();
116 let num_pages = pages.len() as u32;
117 let metadata = extract_metadata(&document);
118
119 Ok(RawPdfDocument {
120 document,
121 num_pages,
122 metadata,
123 })
124}
125
126fn extract_string_field(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
128 dict.get(key).ok().and_then(|obj| match obj {
129 lopdf::Object::String(bytes, _) => String::from_utf8(bytes.clone()).ok(),
130 _ => None,
131 })
132}
133
#[cfg(test)]
mod tests {
    use super::*;

    /// Loading a path that does not exist must fail with a `LoadError` whose
    /// message says the file was not found.
    #[test]
    fn test_load_nonexistent_file() {
        let missing = Path::new("/nonexistent/file.pdf");
        let outcome = load_pdf(missing, None);
        assert!(outcome.is_err());

        let err = outcome.unwrap_err();
        if let EdgePdfError::LoadError(ref msg) = err {
            assert!(msg.contains("File not found"));
        } else {
            panic!("Unexpected error: {:?}", err);
        }
    }
}