1#![warn(missing_docs)]
8
9pub mod api;
10pub mod models;
11pub mod output;
12pub mod pdf;
13pub mod pipeline;
14pub mod utils;
15
16#[cfg(feature = "hybrid")]
17pub mod hybrid;
18
19pub mod tagged;
20
21use crate::api::config::ProcessingConfig;
22use crate::models::content::ContentElement;
23use crate::models::document::PdfDocument;
24use crate::pdf::chunk_parser::extract_page_chunks;
25use crate::pdf::page_info;
26#[cfg(not(target_arch = "wasm32"))]
27use crate::pdf::raster_table_ocr::{
28 recover_page_raster_table_cell_text, recover_raster_table_borders,
29};
30use crate::pipeline::orchestrator::{run_pipeline, PipelineState};
31use crate::tagged::struct_tree::build_mcid_map;
32
/// Convert a PDF file on disk into a structured [`PdfDocument`].
///
/// Loads the PDF at `input_path` (decrypting with `config.password` when set),
/// extracts per-page content chunks (text, images, lines, line art), optionally
/// recovers raster table borders and cell text when `config.raster_table_ocr`
/// is enabled, and runs the layout pipeline to produce the final document tree.
///
/// Not available on `wasm32` targets (raster table OCR reads from the
/// filesystem).
///
/// # Errors
///
/// Returns an [`EdgePdfError`] if the PDF cannot be loaded, a page's content
/// stream cannot be parsed, or the pipeline fails.
#[cfg(not(target_arch = "wasm32"))]
pub fn convert(
    input_path: &std::path::Path,
    config: &ProcessingConfig,
) -> Result<PdfDocument, EdgePdfError> {
    let raw_doc = pdf::loader::load_pdf(input_path, config.password.as_deref())?;

    // Per-page geometry (crop boxes, page numbers) used by the OCR passes and
    // the pipeline.
    let page_info_list = page_info::extract_page_info(&raw_doc.document);

    let pages_map = raw_doc.document.get_pages();
    let mut page_contents = Vec::with_capacity(pages_map.len());

    for (&page_num, &page_id) in &pages_map {
        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
        let mut recovered_tables = Vec::new();
        if config.raster_table_ocr {
            // First OCR pass: recover table border geometry. Matches page info
            // by page number; the crop box maps raster pixels back to PDF space.
            if let Some(page_info) = page_info_list
                .iter()
                .find(|info| info.page_number == page_num)
            {
                recovered_tables = recover_raster_table_borders(
                    input_path,
                    &page_info.crop_box,
                    page_num,
                    &page_chunks.text_chunks,
                    &page_chunks.image_chunks,
                );
            }
        }
        // Flatten every chunk kind into a single ContentElement list for this
        // page, in a fixed order: text, images, lines, line art, table borders.
        let mut elements: Vec<ContentElement> = page_chunks
            .text_chunks
            .into_iter()
            .map(ContentElement::TextChunk)
            .collect();

        elements.extend(
            page_chunks
                .image_chunks
                .into_iter()
                .map(ContentElement::Image),
        );
        elements.extend(
            page_chunks
                .line_chunks
                .into_iter()
                .map(ContentElement::Line),
        );
        elements.extend(
            page_chunks
                .line_art_chunks
                .into_iter()
                .map(ContentElement::LineArt),
        );
        elements.extend(
            recovered_tables
                .into_iter()
                .map(ContentElement::TableBorder),
        );

        page_contents.push(elements);
    }

    // MCID map links content to the tagged-PDF structure tree when present.
    let mcid_map = build_mcid_map(&raw_doc.document);
    // page_info_list is cloned because it is needed again for the second OCR
    // pass below, after the pipeline has taken ownership of its copy.
    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
        .with_page_info(page_info_list.clone());
    run_pipeline(&mut pipeline_state)?;

    let file_name = input_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("unknown.pdf")
        .to_string();

    let mut doc = PdfDocument::new(file_name);
    doc.source_path = Some(input_path.display().to_string());
    doc.number_of_pages = pages_map.len() as u32;
    doc.author = raw_doc.metadata.author;
    doc.title = raw_doc.metadata.title;
    doc.creation_date = raw_doc.metadata.creation_date;
    doc.modification_date = raw_doc.metadata.modification_date;

    if config.raster_table_ocr {
        // Second OCR pass: fill in cell text for the recovered table borders.
        // NOTE(review): this pass pairs pages with page_info_list by index,
        // while the first pass matched by page_number — this assumes
        // pipeline_state.pages and page_info_list share the same order;
        // confirm, or match by page_number here too.
        for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
            if let Some(page_info) = page_info_list.get(page_idx) {
                recover_page_raster_table_cell_text(
                    input_path,
                    &page_info.crop_box,
                    page_info.page_number,
                    page,
                );
            }
        }
    }

    // Flatten the per-page element lists into the document's children.
    for page in pipeline_state.pages {
        doc.kids.extend(page);
    }

    Ok(doc)
}
149
150pub fn convert_bytes(
166 data: &[u8],
167 file_name: &str,
168 config: &ProcessingConfig,
169) -> Result<PdfDocument, EdgePdfError> {
170 let raw_doc = pdf::loader::load_pdf_from_bytes(data, config.password.as_deref())?;
171
172 let page_info_list = page_info::extract_page_info(&raw_doc.document);
173
174 let pages_map = raw_doc.document.get_pages();
175 let mut page_contents = Vec::with_capacity(pages_map.len());
176
177 for (&page_num, &page_id) in &pages_map {
178 let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
179
180 let recovered_tables = Vec::new();
182
183 let mut elements: Vec<ContentElement> = page_chunks
184 .text_chunks
185 .into_iter()
186 .map(ContentElement::TextChunk)
187 .collect();
188
189 elements.extend(
190 page_chunks
191 .image_chunks
192 .into_iter()
193 .map(ContentElement::Image),
194 );
195 elements.extend(
196 page_chunks
197 .line_chunks
198 .into_iter()
199 .map(ContentElement::Line),
200 );
201 elements.extend(
202 page_chunks
203 .line_art_chunks
204 .into_iter()
205 .map(ContentElement::LineArt),
206 );
207 elements.extend(
208 recovered_tables
209 .into_iter()
210 .map(ContentElement::TableBorder),
211 );
212
213 page_contents.push(elements);
214 }
215
216 let mcid_map = build_mcid_map(&raw_doc.document);
217 let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
218 .with_page_info(page_info_list);
219 run_pipeline(&mut pipeline_state)?;
220
221 let mut doc = PdfDocument::new(file_name.to_string());
222 doc.number_of_pages = pages_map.len() as u32;
223 doc.author = raw_doc.metadata.author;
224 doc.title = raw_doc.metadata.title;
225 doc.creation_date = raw_doc.metadata.creation_date;
226 doc.modification_date = raw_doc.metadata.modification_date;
227
228 for page in pipeline_state.pages {
229 doc.kids.extend(page);
230 }
231
232 Ok(doc)
233}
234
/// Errors that can occur while converting a PDF.
#[derive(Debug, thiserror::Error)]
pub enum EdgePdfError {
    /// The PDF file or byte stream could not be loaded.
    #[error("PDF loading error: {0}")]
    LoadError(String),

    /// A processing-pipeline stage failed.
    #[error("Pipeline error at stage {stage}: {message}")]
    PipelineError {
        /// Index of the pipeline stage that failed.
        stage: u32,
        /// Human-readable description of the failure.
        message: String,
    },

    /// Writing or serializing the output failed.
    #[error("Output error: {0}")]
    OutputError(String),

    /// An underlying I/O operation failed.
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),

    /// The supplied [`ProcessingConfig`] was invalid.
    #[error("Configuration error: {0}")]
    ConfigError(String),

    /// The `lopdf` parser reported an error (message preserved as a string).
    #[error("PDF parse error: {0}")]
    LopdfError(String),
}
267
268impl From<lopdf::Error> for EdgePdfError {
269 fn from(e: lopdf::Error) -> Self {
270 EdgePdfError::LopdfError(e.to_string())
271 }
272}
273
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{
        content::{Content, Operation},
        dictionary, Object, Stream,
    };
    use std::io::Write;

    /// Write a minimal single-page PDF fixture to `path`: one Helvetica font,
    /// one content stream drawing two lines of text ("Hello EdgeParse!" and
    /// "Second line of text.") on an A4-sized MediaBox.
    fn create_test_pdf_file(path: &std::path::Path) {
        let mut doc = lopdf::Document::with_version("1.5");
        // Reserve the Pages object id up front so the Page can reference its
        // parent before the Pages dictionary is inserted.
        let pages_id = doc.new_object_id();

        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });

        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });

        // BT/ET text block: set font F1 at 12pt, position near the top-left,
        // show the first string, move down 20 units, show the second.
        let content = Content {
            operations: vec![
                Operation::new("BT", vec![]),
                Operation::new("Tf", vec!["F1".into(), 12.into()]),
                Operation::new("Td", vec![72.into(), 700.into()]),
                Operation::new("Tj", vec![Object::string_literal("Hello EdgeParse!")]),
                Operation::new("Td", vec![0.into(), Object::Real(-20.0)]),
                Operation::new("Tj", vec![Object::string_literal("Second line of text.")]),
                Operation::new("ET", vec![]),
            ],
        };

        let encoded = content.encode().unwrap();
        let content_id = doc.add_object(Stream::new(dictionary! {}, encoded));

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        // Insert at the pre-reserved id rather than add_object, which would
        // allocate a fresh id and break the Page's Parent reference.
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let mut file = std::fs::File::create(path).unwrap();
        doc.save_to(&mut file).unwrap();
        file.flush().unwrap();
    }

    /// End-to-end: build the fixture PDF on disk, run `convert` with default
    /// config, and check page count plus that both text strings survive
    /// extraction in some textual element variant.
    #[test]
    fn test_convert_end_to_end() {
        let dir = std::env::temp_dir().join("edgeparse_test");
        std::fs::create_dir_all(&dir).unwrap();
        let pdf_path = dir.join("test_convert.pdf");

        create_test_pdf_file(&pdf_path);

        let config = ProcessingConfig::default();
        let result = convert(&pdf_path, &config);
        assert!(result.is_ok(), "convert() failed: {:?}", result.err());

        let doc = result.unwrap();
        assert_eq!(doc.number_of_pages, 1);
        assert!(
            !doc.kids.is_empty(),
            "Expected content elements in document"
        );

        // The pipeline may have grouped raw chunks into lines, blocks,
        // paragraphs, or headings — gather text from every variant so the
        // assertions hold regardless of grouping depth.
        let mut all_text = String::new();
        for element in &doc.kids {
            match element {
                models::content::ContentElement::TextChunk(tc) => {
                    all_text.push_str(&tc.value);
                    all_text.push(' ');
                }
                models::content::ContentElement::TextLine(tl) => {
                    all_text.push_str(&tl.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::TextBlock(tb) => {
                    all_text.push_str(&tb.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Paragraph(p) => {
                    all_text.push_str(&p.base.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Heading(h) => {
                    all_text.push_str(&h.base.base.value());
                    all_text.push(' ');
                }
                _ => {}
            }
        }

        assert!(
            all_text.contains("Hello"),
            "Expected 'Hello' in extracted text, got: {}",
            all_text
        );
        assert!(
            all_text.contains("Second"),
            "Expected 'Second' in extracted text, got: {}",
            all_text
        );

        // Best-effort cleanup; ignore failure (temp dir is reaped anyway).
        let _ = std::fs::remove_file(&pdf_path);
    }
}