//! EdgeParse Core Library
//!
//! High-performance PDF-to-structured-data extraction engine.
//! Implements a 20-stage processing pipeline for extracting text, tables,
//! images, and semantic structure from PDF documents.

#![warn(missing_docs)]

pub mod api;
pub mod models;
pub mod output;
pub mod pdf;
pub mod pipeline;
pub mod utils;

#[cfg(feature = "hybrid")]
pub mod hybrid;

pub mod tagged;

use crate::api::config::ProcessingConfig;
use crate::models::content::ContentElement;
use crate::models::document::PdfDocument;
use crate::pdf::chunk_parser::extract_page_chunks;
use crate::pdf::page_info;
// Raster-table OCR recovery depends on an external `pdfimages` binary (see
// the comment in `convert_bytes`), so it is only compiled for native targets.
#[cfg(not(target_arch = "wasm32"))]
use crate::pdf::raster_table_ocr::{
    recover_dominant_image_text_chunks, recover_page_raster_table_cell_text,
    recover_raster_table_borders,
};
use crate::pipeline::orchestrator::{run_pipeline, PipelineState};
use crate::tagged::struct_tree::build_mcid_map;
use std::time::Instant;

/// Main entry point: convert a PDF file to structured data.
///
/// Loads the PDF from disk, extracts per-page content chunks (text, images,
/// lines, line art), optionally recovers text and table borders from raster
/// images, runs the processing pipeline, and flattens the result into a
/// [`PdfDocument`].
///
/// # Arguments
/// * `input_path` - Path to the input PDF file
/// * `config` - Processing configuration
///
/// # Returns
/// * `Result<PdfDocument>` - The extracted structured document
///
/// # Errors
/// Returns an error if the PDF cannot be loaded or processed.
#[cfg(not(target_arch = "wasm32"))]
pub fn convert(
    input_path: &std::path::Path,
    config: &ProcessingConfig,
) -> Result<PdfDocument, EdgePdfError> {
    // Phase timing is opt-in via the EDGEPARSE_TIMING env var; resolved once here.
    let timing_enabled = timing_enabled();
    let total_start = Instant::now();

    let phase_start = Instant::now();
    let raw_doc = pdf::loader::load_pdf(input_path, config.password.as_deref())?;
    log_phase_duration(timing_enabled, "load_pdf", phase_start);

    // Extract per-page geometry (MediaBox, CropBox, rotation) for use throughout the pipeline.
    let phase_start = Instant::now();
    let page_info_list = page_info::extract_page_info(&raw_doc.document);
    log_phase_duration(timing_enabled, "extract_page_info", phase_start);

    // Extract text chunks from each page
    let pages_map = raw_doc.document.get_pages();
    // Index by 1-based page number for fast lookup during optional OCR recovery.
    // Slot 0 is intentionally unused; out-of-range page numbers are skipped.
    // Keep this out of the default fast path when OCR is disabled.
    let page_info_by_number: Vec<Option<&page_info::PageInfo>> =
        if config.raster_table_ocr_enabled() {
            let mut index = vec![None; pages_map.len().saturating_add(1)];
            for info in &page_info_list {
                if let Some(slot) = index.get_mut(info.page_number as usize) {
                    *slot = Some(info);
                }
            }
            index
        } else {
            Vec::new()
        };
    let mut page_contents = Vec::with_capacity(pages_map.len());

    let phase_start = Instant::now();
    for (&page_num, &page_id) in &pages_map {
        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
        let mut recovered_text_chunks = Vec::new();
        let mut recovered_tables = Vec::new();
        if config.raster_table_ocr_enabled() {
            // OCR recovery re-reads the source file via `input_path`, which is
            // why this step only exists on the native (non-wasm) entry point.
            if let Some(Some(page_info)) = page_info_by_number.get(page_num as usize) {
                recovered_text_chunks = recover_dominant_image_text_chunks(
                    input_path,
                    &page_info.crop_box,
                    page_num,
                    &page_chunks.text_chunks,
                    &page_chunks.image_chunks,
                );
                recovered_tables = recover_raster_table_borders(
                    input_path,
                    &page_info.crop_box,
                    page_num,
                    &page_chunks.text_chunks,
                    &page_chunks.image_chunks,
                );
            }
        }
        // Merge all chunk kinds into one per-page element list, in order:
        // native text, OCR-recovered text, images, lines, line art,
        // recovered table borders.
        let mut elements: Vec<ContentElement> = page_chunks
            .text_chunks
            .into_iter()
            .map(ContentElement::TextChunk)
            .collect();
        elements.extend(
            recovered_text_chunks
                .into_iter()
                .map(ContentElement::TextChunk),
        );

        elements.extend(
            page_chunks
                .image_chunks
                .into_iter()
                .map(ContentElement::Image),
        );
        elements.extend(
            page_chunks
                .line_chunks
                .into_iter()
                .map(ContentElement::Line),
        );
        elements.extend(
            page_chunks
                .line_art_chunks
                .into_iter()
                .map(ContentElement::LineArt),
        );
        elements.extend(
            recovered_tables
                .into_iter()
                .map(ContentElement::TableBorder),
        );

        page_contents.push(elements);
    }
    log_phase_duration(timing_enabled, "extract_page_chunks", phase_start);

    // Run the processing pipeline
    let phase_start = Instant::now();
    let mcid_map = build_mcid_map(&raw_doc.document);
    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
        .with_page_info(page_info_list);
    run_pipeline(&mut pipeline_state)?;
    log_phase_duration(timing_enabled, "run_pipeline", phase_start);

    // Build the output document
    let file_name = input_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("unknown.pdf")
        .to_string();

    let mut doc = PdfDocument::new(file_name);
    doc.source_path = Some(input_path.display().to_string());
    doc.number_of_pages = pages_map.len() as u32;
    doc.author = raw_doc.metadata.author;
    doc.title = raw_doc.metadata.title;
    doc.creation_date = raw_doc.metadata.creation_date;
    doc.modification_date = raw_doc.metadata.modification_date;

    // Table cell-text OCR runs after the pipeline, over the pipeline's pages.
    let phase_start = Instant::now();
    if config.raster_table_ocr_enabled() {
        for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
            if let Some(page_info) = pipeline_state.page_info.get(page_idx) {
                recover_page_raster_table_cell_text(
                    input_path,
                    &page_info.crop_box,
                    page_info.page_number,
                    page,
                );
            }
        }
    }
    log_phase_duration(
        timing_enabled,
        "recover_page_raster_table_cell_text",
        phase_start,
    );

    // Flatten pipeline output into document kids
    let phase_start = Instant::now();
    for page in pipeline_state.pages {
        doc.kids.extend(page);
    }
    log_phase_duration(timing_enabled, "flatten_document", phase_start);
    log_phase_duration(timing_enabled, "convert_total", total_start);

    Ok(doc)
}

196/// Convert a PDF from an in-memory byte slice to structured data.
197///
198/// This is the WASM-compatible entry point. It replaces all filesystem
199/// operations with in-memory equivalents and skips raster table OCR.
200///
201/// # Arguments
202/// * `data` — raw PDF bytes (e.g., from a `Uint8Array` in JavaScript)
203/// * `file_name` — display name (used in `PdfDocument.file_name`)
204/// * `config` — processing configuration
205///
206/// # Returns
207/// Structured document or error.
208///
209/// # Errors
210/// Returns an error if the PDF cannot be parsed or processed.
211pub fn convert_bytes(
212    data: &[u8],
213    file_name: &str,
214    config: &ProcessingConfig,
215) -> Result<PdfDocument, EdgePdfError> {
216    let raw_doc = pdf::loader::load_pdf_from_bytes(data, config.password.as_deref())?;
217
218    let page_info_list = page_info::extract_page_info(&raw_doc.document);
219
220    let pages_map = raw_doc.document.get_pages();
221    let mut page_contents = Vec::with_capacity(pages_map.len());
222
223    for (&page_num, &page_id) in &pages_map {
224        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
225
226        // Raster table OCR requires external pdfimages binary — skip in memory-only mode
227        let recovered_tables = Vec::new();
228
229        let mut elements: Vec<ContentElement> = page_chunks
230            .text_chunks
231            .into_iter()
232            .map(ContentElement::TextChunk)
233            .collect();
234
235        elements.extend(
236            page_chunks
237                .image_chunks
238                .into_iter()
239                .map(ContentElement::Image),
240        );
241        elements.extend(
242            page_chunks
243                .line_chunks
244                .into_iter()
245                .map(ContentElement::Line),
246        );
247        elements.extend(
248            page_chunks
249                .line_art_chunks
250                .into_iter()
251                .map(ContentElement::LineArt),
252        );
253        elements.extend(
254            recovered_tables
255                .into_iter()
256                .map(ContentElement::TableBorder),
257        );
258
259        page_contents.push(elements);
260    }
261
262    let mcid_map = build_mcid_map(&raw_doc.document);
263    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
264        .with_page_info(page_info_list);
265    run_pipeline(&mut pipeline_state)?;
266
267    let mut doc = PdfDocument::new(file_name.to_string());
268    doc.number_of_pages = pages_map.len() as u32;
269    doc.author = raw_doc.metadata.author;
270    doc.title = raw_doc.metadata.title;
271    doc.creation_date = raw_doc.metadata.creation_date;
272    doc.modification_date = raw_doc.metadata.modification_date;
273
274    for page in pipeline_state.pages {
275        doc.kids.extend(page);
276    }
277
278    Ok(doc)
279}
280
/// Top-level error type for EdgeParse operations.
#[derive(Debug, thiserror::Error)]
pub enum EdgePdfError {
    /// PDF loading error (file access, password, or format problems).
    #[error("PDF loading error: {0}")]
    LoadError(String),

    /// Pipeline processing error, tagged with the stage that failed.
    #[error("Pipeline error at stage {stage}: {message}")]
    PipelineError {
        /// Pipeline stage number (1-20)
        stage: u32,
        /// Error description
        message: String,
    },

    /// Output generation error
    #[error("Output error: {0}")]
    OutputError(String),

    /// I/O error
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),

    /// Configuration error
    #[error("Configuration error: {0}")]
    ConfigError(String),

    /// Parse error from the `lopdf` backend, stored as its display text
    /// (produced by the `From<lopdf::Error>` impl in this file).
    #[error("PDF parse error: {0}")]
    LopdfError(String),
}

314impl From<lopdf::Error> for EdgePdfError {
315    fn from(e: lopdf::Error) -> Self {
316        EdgePdfError::LopdfError(e.to_string())
317    }
318}
319
/// Returns `true` when phase timing has been requested via the
/// `EDGEPARSE_TIMING` environment variable.
///
/// Accepted truthy values (case-insensitive): `"1"`, `"true"`, `"yes"`,
/// `"on"`. Anything else — including an unset or non-UTF-8 variable —
/// leaves timing disabled.
fn timing_enabled() -> bool {
    match std::env::var("EDGEPARSE_TIMING") {
        Ok(raw) => {
            let flag = raw.to_ascii_lowercase();
            flag == "1" || flag == "true" || flag == "yes" || flag == "on"
        }
        Err(_) => false,
    }
}

331fn log_phase_duration(enabled: bool, phase: &str, start: Instant) {
332    if enabled {
333        log::info!(
334            "Timing {}: {:.2} ms",
335            phase,
336            start.elapsed().as_secs_f64() * 1000.0
337        );
338    }
339}
340
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{
        content::{Content, Operation},
        dictionary, Object, Stream,
    };
    use std::io::Write;

    /// Create a synthetic PDF file for integration testing.
    ///
    /// Produces a single A4 page (595x842 pt) containing two lines of
    /// Helvetica text drawn by a minimal content stream.
    fn create_test_pdf_file(path: &std::path::Path) {
        let mut doc = lopdf::Document::with_version("1.5");
        // Reserve the Pages node id up front so the page can name its parent.
        let pages_id = doc.new_object_id();

        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });

        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });

        // Two text-show (Tj) operations separated by a -20 pt vertical move,
        // so the extractor sees two distinct lines.
        let content = Content {
            operations: vec![
                Operation::new("BT", vec![]),
                Operation::new("Tf", vec!["F1".into(), 12.into()]),
                Operation::new("Td", vec![72.into(), 700.into()]),
                Operation::new("Tj", vec![Object::string_literal("Hello EdgeParse!")]),
                Operation::new("Td", vec![0.into(), Object::Real(-20.0)]),
                Operation::new("Tj", vec![Object::string_literal("Second line of text.")]),
                Operation::new("ET", vec![]),
            ],
        };

        let encoded = content.encode().unwrap();
        let content_id = doc.add_object(Stream::new(dictionary! {}, encoded));

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        // Fill in the previously reserved Pages object id.
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let mut file = std::fs::File::create(path).unwrap();
        doc.save_to(&mut file).unwrap();
        file.flush().unwrap();
    }

    /// End-to-end smoke test: build a PDF on disk, run `convert`, and verify
    /// that both lines of text survive the full pipeline.
    #[test]
    fn test_convert_end_to_end() {
        let dir = std::env::temp_dir().join("edgeparse_test");
        std::fs::create_dir_all(&dir).unwrap();
        let pdf_path = dir.join("test_convert.pdf");

        create_test_pdf_file(&pdf_path);

        let config = ProcessingConfig::default();
        let result = convert(&pdf_path, &config);
        assert!(result.is_ok(), "convert() failed: {:?}", result.err());

        let doc = result.unwrap();
        assert_eq!(doc.number_of_pages, 1);
        assert!(
            !doc.kids.is_empty(),
            "Expected content elements in document"
        );

        // Check that we extracted content (may be TextChunks, TextLines, or TextBlocks after pipeline)
        let mut all_text = String::new();
        for element in &doc.kids {
            match element {
                models::content::ContentElement::TextChunk(tc) => {
                    all_text.push_str(&tc.value);
                    all_text.push(' ');
                }
                models::content::ContentElement::TextLine(tl) => {
                    all_text.push_str(&tl.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::TextBlock(tb) => {
                    all_text.push_str(&tb.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Paragraph(p) => {
                    all_text.push_str(&p.base.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Heading(h) => {
                    all_text.push_str(&h.base.base.value());
                    all_text.push(' ');
                }
                _ => {}
            }
        }

        assert!(
            all_text.contains("Hello"),
            "Expected 'Hello' in extracted text, got: {}",
            all_text
        );
        assert!(
            all_text.contains("Second"),
            "Expected 'Second' in extracted text, got: {}",
            all_text
        );

        // Cleanup
        let _ = std::fs::remove_file(&pdf_path);
    }
469}