1#![warn(missing_docs)]
8
9pub mod api;
10pub mod models;
11pub mod output;
12pub mod pdf;
13pub mod pipeline;
14pub mod utils;
15
16#[cfg(feature = "hybrid")]
17pub mod hybrid;
18
19pub mod tagged;
20
21use crate::api::config::ProcessingConfig;
22use crate::models::content::ContentElement;
23use crate::models::document::PdfDocument;
24use crate::pdf::chunk_parser::extract_page_chunks;
25use crate::pdf::page_info;
26use crate::pdf::raster_table_ocr::recover_raster_table_borders;
27use crate::pipeline::orchestrator::{run_pipeline, PipelineState};
28use crate::tagged::struct_tree::build_mcid_map;
29
30pub fn convert(
42 input_path: &std::path::Path,
43 config: &ProcessingConfig,
44) -> Result<PdfDocument, EdgePdfError> {
45 let raw_doc = pdf::loader::load_pdf(input_path, config.password.as_deref())?;
46
47 let page_info_list = page_info::extract_page_info(&raw_doc.document);
49
50 let pages_map = raw_doc.document.get_pages();
52 let mut page_contents = Vec::with_capacity(pages_map.len());
53
54 for (&page_num, &page_id) in &pages_map {
55 let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
56 let mut recovered_tables = Vec::new();
57 if let Some(page_info) = page_info_list
58 .iter()
59 .find(|info| info.page_number == page_num)
60 {
61 recovered_tables = recover_raster_table_borders(
62 input_path,
63 &page_info.crop_box,
64 page_num,
65 &page_chunks.text_chunks,
66 &page_chunks.image_chunks,
67 );
68 }
69 let mut elements: Vec<ContentElement> = page_chunks
70 .text_chunks
71 .into_iter()
72 .map(ContentElement::TextChunk)
73 .collect();
74
75 elements.extend(
76 page_chunks
77 .image_chunks
78 .into_iter()
79 .map(ContentElement::Image),
80 );
81 elements.extend(
82 page_chunks
83 .line_chunks
84 .into_iter()
85 .map(ContentElement::Line),
86 );
87 elements.extend(
88 page_chunks
89 .line_art_chunks
90 .into_iter()
91 .map(ContentElement::LineArt),
92 );
93 elements.extend(
94 recovered_tables
95 .into_iter()
96 .map(ContentElement::TableBorder),
97 );
98
99 page_contents.push(elements);
100 }
101
102 let mcid_map = build_mcid_map(&raw_doc.document);
104 let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
105 .with_page_info(page_info_list);
106 run_pipeline(&mut pipeline_state)?;
107
108 let file_name = input_path
110 .file_name()
111 .and_then(|n| n.to_str())
112 .unwrap_or("unknown.pdf")
113 .to_string();
114
115 let mut doc = PdfDocument::new(file_name);
116 doc.number_of_pages = pages_map.len() as u32;
117 doc.author = raw_doc.metadata.author;
118 doc.title = raw_doc.metadata.title;
119 doc.creation_date = raw_doc.metadata.creation_date;
120 doc.modification_date = raw_doc.metadata.modification_date;
121
122 for page in pipeline_state.pages {
124 doc.kids.extend(page);
125 }
126
127 Ok(doc)
128}
129
130#[derive(Debug, thiserror::Error)]
132pub enum EdgePdfError {
133 #[error("PDF loading error: {0}")]
135 LoadError(String),
136
137 #[error("Pipeline error at stage {stage}: {message}")]
139 PipelineError {
140 stage: u32,
142 message: String,
144 },
145
146 #[error("Output error: {0}")]
148 OutputError(String),
149
150 #[error("I/O error: {0}")]
152 IoError(#[from] std::io::Error),
153
154 #[error("Configuration error: {0}")]
156 ConfigError(String),
157
158 #[error("PDF parse error: {0}")]
160 LopdfError(String),
161}
162
163impl From<lopdf::Error> for EdgePdfError {
164 fn from(e: lopdf::Error) -> Self {
165 EdgePdfError::LopdfError(e.to_string())
166 }
167}
168
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{
        content::{Content, Operation},
        dictionary, Object, Stream,
    };
    use std::io::Write;

    /// Appends `text` followed by a single space to `buf`.
    fn push_word(buf: &mut String, text: &str) {
        buf.push_str(text);
        buf.push(' ');
    }

    /// Writes a one-page PDF with two lines of Helvetica text to `path`.
    ///
    /// Objects are added in a fixed order (page tree id, font, resources, content
    /// stream, page, catalog) so the generated file stays the same between runs.
    fn create_test_pdf_file(path: &std::path::Path) {
        let mut pdf = lopdf::Document::with_version("1.5");
        // Reserve the page tree id first: the page needs it as its parent before
        // the page tree dictionary itself can be built.
        let pages_id = pdf.new_object_id();

        let font_id = pdf.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });

        let resources_id = pdf.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });

        // Two text runs: one at (72, 700) and a second one 20 units below it.
        let operations = vec![
            Operation::new("BT", vec![]),
            Operation::new("Tf", vec!["F1".into(), 12.into()]),
            Operation::new("Td", vec![72.into(), 700.into()]),
            Operation::new("Tj", vec![Object::string_literal("Hello EdgeParse!")]),
            Operation::new("Td", vec![0.into(), Object::Real(-20.0)]),
            Operation::new("Tj", vec![Object::string_literal("Second line of text.")]),
            Operation::new("ET", vec![]),
        ];
        let stream_bytes = Content { operations }.encode().unwrap();
        let content_id = pdf.add_object(Stream::new(dictionary! {}, stream_bytes));

        let page_id = pdf.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        // Fill in the page tree under the id reserved above.
        pdf.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => vec![page_id.into()],
                "Count" => 1,
            }),
        );

        let catalog_id = pdf.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        pdf.trailer.set("Root", catalog_id);

        let mut out = std::fs::File::create(path).unwrap();
        pdf.save_to(&mut out).unwrap();
        out.flush().unwrap();
    }

    /// Runs `convert` on a generated PDF and checks the page count and that both
    /// text lines appear somewhere in the resulting elements.
    #[test]
    fn test_convert_end_to_end() {
        let work_dir = std::env::temp_dir().join("edgeparse_test");
        std::fs::create_dir_all(&work_dir).unwrap();
        let pdf_path = work_dir.join("test_convert.pdf");

        create_test_pdf_file(&pdf_path);

        let outcome = convert(&pdf_path, &ProcessingConfig::default());
        assert!(outcome.is_ok(), "convert() failed: {:?}", outcome.err());

        let doc = outcome.unwrap();
        assert_eq!(doc.number_of_pages, 1);
        assert!(
            !doc.kids.is_empty(),
            "Expected content elements in document"
        );

        // Gather text from every text-bearing element kind; other kinds are skipped.
        let mut all_text = String::new();
        for element in doc.kids.iter() {
            match element {
                ContentElement::TextChunk(chunk) => push_word(&mut all_text, &chunk.value),
                ContentElement::TextLine(line) => push_word(&mut all_text, &line.value()),
                ContentElement::TextBlock(block) => push_word(&mut all_text, &block.value()),
                ContentElement::Paragraph(para) => push_word(&mut all_text, &para.base.value()),
                ContentElement::Heading(heading) => {
                    push_word(&mut all_text, &heading.base.base.value())
                }
                _ => {}
            }
        }

        assert!(
            all_text.contains("Hello"),
            "Expected 'Hello' in extracted text, got: {}",
            all_text
        );
        assert!(
            all_text.contains("Second"),
            "Expected 'Second' in extracted text, got: {}",
            all_text
        );

        // Best-effort cleanup; a failure to remove the file is ignored.
        std::fs::remove_file(&pdf_path).ok();
    }
}