1#![warn(missing_docs)]
8
9pub mod api;
10pub mod models;
11pub mod output;
12pub mod pdf;
13pub mod pipeline;
14pub mod utils;
15
16#[cfg(feature = "hybrid")]
17pub mod hybrid;
18
19pub mod tagged;
20
21use crate::api::config::ProcessingConfig;
22use crate::models::content::ContentElement;
23use crate::models::document::PdfDocument;
24use crate::pdf::chunk_parser::extract_page_chunks;
25use crate::pdf::page_info;
26#[cfg(not(target_arch = "wasm32"))]
27use crate::pdf::raster_table_ocr::{
28 recover_dominant_image_text_chunks, recover_page_raster_table_cell_text,
29 recover_raster_table_borders,
30};
31use crate::pipeline::orchestrator::{run_pipeline, PipelineState};
32use crate::tagged::struct_tree::build_mcid_map;
33use std::time::Instant;
34
/// Convert a PDF file on disk into a structured [`PdfDocument`].
///
/// Runs the full extraction flow: load the PDF, extract per-page geometry and
/// raw content chunks, optionally recover text/table structure from rasterized
/// pages via OCR, run the layout pipeline, then flatten the result into a
/// single document. Each phase is timed and logged when the
/// `EDGEPARSE_TIMING` environment variable is set (see `timing_enabled`).
///
/// Not available on `wasm32`: the raster-OCR helpers and file access require
/// a native target.
///
/// # Arguments
/// * `input_path` - Path to the PDF file to convert.
/// * `config` - Processing options (password, OCR toggles, pipeline settings).
///
/// # Errors
/// Returns [`EdgePdfError`] if the file cannot be loaded/parsed or a pipeline
/// stage fails.
#[cfg(not(target_arch = "wasm32"))]
pub fn convert(
    input_path: &std::path::Path,
    config: &ProcessingConfig,
) -> Result<PdfDocument, EdgePdfError> {
    // Shadowing the helper fn with its boolean result is intentional.
    let timing_enabled = timing_enabled();
    let total_start = Instant::now();

    // Phase 1: parse the raw PDF (decrypting with the configured password, if any).
    let phase_start = Instant::now();
    let raw_doc = pdf::loader::load_pdf(input_path, config.password.as_deref())?;
    log_phase_duration(timing_enabled, "load_pdf", phase_start);

    // Phase 2: per-page geometry (crop boxes etc.) used by the OCR recovery below.
    let phase_start = Instant::now();
    let page_info_list = page_info::extract_page_info(&raw_doc.document);
    log_phase_duration(timing_enabled, "extract_page_info", phase_start);

    let pages_map = raw_doc.document.get_pages();
    // Index page infos by their (1-based, hence len()+1 slots) page number so
    // the extraction loop below can look them up in O(1). Built only when
    // raster-table OCR is enabled; otherwise an empty Vec so `.get()` misses.
    let page_info_by_number: Vec<Option<&page_info::PageInfo>> =
        if config.raster_table_ocr_enabled() {
            let mut index = vec![None; pages_map.len().saturating_add(1)];
            for info in &page_info_list {
                if let Some(slot) = index.get_mut(info.page_number as usize) {
                    *slot = Some(info);
                }
            }
            index
        } else {
            Vec::new()
        };
    let mut page_contents = Vec::with_capacity(pages_map.len());

    // Phase 3: extract content chunks page by page, with optional OCR recovery
    // of text and table borders from pages dominated by raster images.
    let phase_start = Instant::now();
    for (&page_num, &page_id) in &pages_map {
        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
        let mut recovered_text_chunks = Vec::new();
        let mut recovered_tables = Vec::new();
        if config.raster_table_ocr_enabled() {
            // Nested Option: outer = index in bounds, inner = info present.
            if let Some(Some(page_info)) = page_info_by_number.get(page_num as usize) {
                recovered_text_chunks = recover_dominant_image_text_chunks(
                    input_path,
                    &page_info.crop_box,
                    page_num,
                    &page_chunks.text_chunks,
                    &page_chunks.image_chunks,
                );
                recovered_tables = recover_raster_table_borders(
                    input_path,
                    &page_info.crop_box,
                    page_num,
                    &page_chunks.text_chunks,
                    &page_chunks.image_chunks,
                );
            }
        }
        // Merge every chunk family into one heterogeneous element list:
        // native text, OCR-recovered text, images, lines, line art, and
        // recovered table borders.
        let mut elements: Vec<ContentElement> = page_chunks
            .text_chunks
            .into_iter()
            .map(ContentElement::TextChunk)
            .collect();
        elements.extend(
            recovered_text_chunks
                .into_iter()
                .map(ContentElement::TextChunk),
        );

        elements.extend(
            page_chunks
                .image_chunks
                .into_iter()
                .map(ContentElement::Image),
        );
        elements.extend(
            page_chunks
                .line_chunks
                .into_iter()
                .map(ContentElement::Line),
        );
        elements.extend(
            page_chunks
                .line_art_chunks
                .into_iter()
                .map(ContentElement::LineArt),
        );
        elements.extend(
            recovered_tables
                .into_iter()
                .map(ContentElement::TableBorder),
        );

        page_contents.push(elements);
    }
    log_phase_duration(timing_enabled, "extract_page_chunks", phase_start);

    // Phase 4: run the layout/structure pipeline, seeded with the tagged-PDF
    // MCID map and the page geometry gathered above.
    let phase_start = Instant::now();
    let mcid_map = build_mcid_map(&raw_doc.document);
    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
        .with_page_info(page_info_list);
    run_pipeline(&mut pipeline_state)?;
    log_phase_duration(timing_enabled, "run_pipeline", phase_start);

    // Fall back to a placeholder name if the path has no UTF-8 file name.
    let file_name = input_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("unknown.pdf")
        .to_string();

    let mut doc = PdfDocument::new(file_name);
    doc.source_path = Some(input_path.display().to_string());
    doc.number_of_pages = pages_map.len() as u32;
    doc.author = raw_doc.metadata.author;
    doc.title = raw_doc.metadata.title;
    doc.creation_date = raw_doc.metadata.creation_date;
    doc.modification_date = raw_doc.metadata.modification_date;

    // Phase 5: post-pipeline OCR pass that fills in cell text for tables
    // detected on rasterized pages. NOTE(review): `page_info` is indexed by
    // position here (not by page number as above) — assumes
    // `pipeline_state.page_info` stays parallel to `pipeline_state.pages`.
    let phase_start = Instant::now();
    if config.raster_table_ocr_enabled() {
        for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
            if let Some(page_info) = pipeline_state.page_info.get(page_idx) {
                recover_page_raster_table_cell_text(
                    input_path,
                    &page_info.crop_box,
                    page_info.page_number,
                    page,
                );
            }
        }
    }
    log_phase_duration(
        timing_enabled,
        "recover_page_raster_table_cell_text",
        phase_start,
    );

    // Phase 6: flatten the per-page element lists into the document tree.
    let phase_start = Instant::now();
    for page in pipeline_state.pages {
        doc.kids.extend(page);
    }
    log_phase_duration(timing_enabled, "flatten_document", phase_start);
    log_phase_duration(timing_enabled, "convert_total", total_start);

    Ok(doc)
}
195
196pub fn convert_bytes(
212 data: &[u8],
213 file_name: &str,
214 config: &ProcessingConfig,
215) -> Result<PdfDocument, EdgePdfError> {
216 let raw_doc = pdf::loader::load_pdf_from_bytes(data, config.password.as_deref())?;
217
218 let page_info_list = page_info::extract_page_info(&raw_doc.document);
219
220 let pages_map = raw_doc.document.get_pages();
221 let mut page_contents = Vec::with_capacity(pages_map.len());
222
223 for (&page_num, &page_id) in &pages_map {
224 let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
225
226 let recovered_tables = Vec::new();
228
229 let mut elements: Vec<ContentElement> = page_chunks
230 .text_chunks
231 .into_iter()
232 .map(ContentElement::TextChunk)
233 .collect();
234
235 elements.extend(
236 page_chunks
237 .image_chunks
238 .into_iter()
239 .map(ContentElement::Image),
240 );
241 elements.extend(
242 page_chunks
243 .line_chunks
244 .into_iter()
245 .map(ContentElement::Line),
246 );
247 elements.extend(
248 page_chunks
249 .line_art_chunks
250 .into_iter()
251 .map(ContentElement::LineArt),
252 );
253 elements.extend(
254 recovered_tables
255 .into_iter()
256 .map(ContentElement::TableBorder),
257 );
258
259 page_contents.push(elements);
260 }
261
262 let mcid_map = build_mcid_map(&raw_doc.document);
263 let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
264 .with_page_info(page_info_list);
265 run_pipeline(&mut pipeline_state)?;
266
267 let mut doc = PdfDocument::new(file_name.to_string());
268 doc.number_of_pages = pages_map.len() as u32;
269 doc.author = raw_doc.metadata.author;
270 doc.title = raw_doc.metadata.title;
271 doc.creation_date = raw_doc.metadata.creation_date;
272 doc.modification_date = raw_doc.metadata.modification_date;
273
274 for page in pipeline_state.pages {
275 doc.kids.extend(page);
276 }
277
278 Ok(doc)
279}
280
/// Unified error type for every fallible operation in this crate.
///
/// Display text for each variant is supplied by the `thiserror` `#[error]`
/// attributes below.
#[derive(Debug, thiserror::Error)]
pub enum EdgePdfError {
    /// The PDF could not be loaded (e.g. unreadable file or bad password).
    #[error("PDF loading error: {0}")]
    LoadError(String),

    /// A stage of the processing pipeline failed.
    #[error("Pipeline error at stage {stage}: {message}")]
    PipelineError {
        /// Identifier of the pipeline stage that failed.
        stage: u32,
        /// Human-readable description of the failure.
        message: String,
    },

    /// Producing the output representation failed.
    #[error("Output error: {0}")]
    OutputError(String),

    /// An underlying I/O operation failed (converted via `#[from]`).
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),

    /// The supplied [`ProcessingConfig`] was invalid.
    #[error("Configuration error: {0}")]
    ConfigError(String),

    /// The `lopdf` parser reported an error (see the `From` impl below).
    #[error("PDF parse error: {0}")]
    LopdfError(String),
}
313
314impl From<lopdf::Error> for EdgePdfError {
315 fn from(e: lopdf::Error) -> Self {
316 EdgePdfError::LopdfError(e.to_string())
317 }
318}
319
/// Reports whether phase-timing logs are requested via the
/// `EDGEPARSE_TIMING` environment variable.
///
/// Accepts `1`, `true`, `yes`, or `on` (case-insensitive); any other value,
/// or an unset/non-UTF-8 variable, disables timing.
fn timing_enabled() -> bool {
    match std::env::var("EDGEPARSE_TIMING") {
        Ok(raw) => {
            let flag = raw.to_ascii_lowercase();
            flag == "1" || flag == "true" || flag == "yes" || flag == "on"
        }
        Err(_) => false,
    }
}
330
331fn log_phase_duration(enabled: bool, phase: &str, start: Instant) {
332 if enabled {
333 log::info!(
334 "Timing {}: {:.2} ms",
335 phase,
336 start.elapsed().as_secs_f64() * 1000.0
337 );
338 }
339}
340
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{
        content::{Content, Operation},
        dictionary, Object, Stream,
    };
    use std::io::Write;

    /// Builds a minimal single-page PDF (one Helvetica font, two `Tj` text
    /// lines) with `lopdf` and saves it to `path`, so `convert` has a real
    /// file to parse end-to-end.
    fn create_test_pdf_file(path: &std::path::Path) {
        let mut doc = lopdf::Document::with_version("1.5");
        // Reserve the Pages id up front: the page must reference it as
        // Parent before the Pages dictionary itself is inserted.
        let pages_id = doc.new_object_id();

        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });

        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });

        // Content stream: begin text, select F1 at 12pt, position the
        // cursor, show two strings on separate lines, end text.
        let content = Content {
            operations: vec![
                Operation::new("BT", vec![]),
                Operation::new("Tf", vec!["F1".into(), 12.into()]),
                Operation::new("Td", vec![72.into(), 700.into()]),
                Operation::new("Tj", vec![Object::string_literal("Hello EdgeParse!")]),
                Operation::new("Td", vec![0.into(), Object::Real(-20.0)]),
                Operation::new("Tj", vec![Object::string_literal("Second line of text.")]),
                Operation::new("ET", vec![]),
            ],
        };

        let encoded = content.encode().unwrap();
        let content_id = doc.add_object(Stream::new(dictionary! {}, encoded));

        // A4-sized page (595x842 points) wired to its content and resources.
        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let mut file = std::fs::File::create(path).unwrap();
        doc.save_to(&mut file).unwrap();
        file.flush().unwrap();
    }

    /// End-to-end smoke test: generate a PDF on disk, run `convert`, and
    /// check that the page count and the extracted text survive the pipeline.
    #[test]
    fn test_convert_end_to_end() {
        let dir = std::env::temp_dir().join("edgeparse_test");
        std::fs::create_dir_all(&dir).unwrap();
        let pdf_path = dir.join("test_convert.pdf");

        create_test_pdf_file(&pdf_path);

        let config = ProcessingConfig::default();
        let result = convert(&pdf_path, &config);
        assert!(result.is_ok(), "convert() failed: {:?}", result.err());

        let doc = result.unwrap();
        assert_eq!(doc.number_of_pages, 1);
        assert!(
            !doc.kids.is_empty(),
            "Expected content elements in document"
        );

        // Collect text from every element kind the pipeline may have
        // produced: the raw chunks can be merged into lines, blocks,
        // paragraphs, or headings depending on layout analysis.
        let mut all_text = String::new();
        for element in &doc.kids {
            match element {
                models::content::ContentElement::TextChunk(tc) => {
                    all_text.push_str(&tc.value);
                    all_text.push(' ');
                }
                models::content::ContentElement::TextLine(tl) => {
                    all_text.push_str(&tl.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::TextBlock(tb) => {
                    all_text.push_str(&tb.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Paragraph(p) => {
                    all_text.push_str(&p.base.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Heading(h) => {
                    all_text.push_str(&h.base.base.value());
                    all_text.push(' ');
                }
                _ => {}
            }
        }

        assert!(
            all_text.contains("Hello"),
            "Expected 'Hello' in extracted text, got: {}",
            all_text
        );
        assert!(
            all_text.contains("Second"),
            "Expected 'Second' in extracted text, got: {}",
            all_text
        );

        // Best-effort cleanup; the temp dir itself is left in place.
        let _ = std::fs::remove_file(&pdf_path);
    }
}