// edgeparse_core/lib.rs
1//! EdgeParse Core Library
2//!
3//! High-performance PDF-to-structured-data extraction engine.
4//! Implements a 20-stage processing pipeline for extracting text, tables,
5//! images, and semantic structure from PDF documents.
6
7#![warn(missing_docs)]
8
9pub mod api;
10pub mod models;
11pub mod output;
12pub mod pdf;
13pub mod pipeline;
14pub mod utils;
15
16#[cfg(feature = "hybrid")]
17pub mod hybrid;
18
19pub mod tagged;
20
21use crate::api::config::ProcessingConfig;
22use crate::models::content::ContentElement;
23use crate::models::document::PdfDocument;
24use crate::pdf::chunk_parser::extract_page_chunks;
25use crate::pdf::page_info;
26#[cfg(not(target_arch = "wasm32"))]
27use crate::pdf::raster_table_ocr::{
28    recover_page_raster_table_cell_text, recover_raster_table_borders,
29};
30use crate::pipeline::orchestrator::{run_pipeline, PipelineState};
31use crate::tagged::struct_tree::build_mcid_map;
32
33/// Main entry point: convert a PDF file to structured data.
34///
35/// # Arguments
36/// * `input_path` - Path to the input PDF file
37/// * `config` - Processing configuration
38///
39/// # Returns
40/// * `Result<PdfDocument>` - The extracted structured document
41///
42/// # Errors
43/// Returns an error if the PDF cannot be loaded or processed.
44#[cfg(not(target_arch = "wasm32"))]
45pub fn convert(
46    input_path: &std::path::Path,
47    config: &ProcessingConfig,
48) -> Result<PdfDocument, EdgePdfError> {
49    let raw_doc = pdf::loader::load_pdf(input_path, config.password.as_deref())?;
50
51    // Extract per-page geometry (MediaBox, CropBox, rotation) for use throughout the pipeline.
52    let page_info_list = page_info::extract_page_info(&raw_doc.document);
53
54    // Extract text chunks from each page
55    let pages_map = raw_doc.document.get_pages();
56    let mut page_contents = Vec::with_capacity(pages_map.len());
57
58    for (&page_num, &page_id) in &pages_map {
59        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
60        let mut recovered_tables = Vec::new();
61        if config.raster_table_ocr {
62            if let Some(page_info) = page_info_list
63                .iter()
64                .find(|info| info.page_number == page_num)
65            {
66                recovered_tables = recover_raster_table_borders(
67                    input_path,
68                    &page_info.crop_box,
69                    page_num,
70                    &page_chunks.text_chunks,
71                    &page_chunks.image_chunks,
72                );
73            }
74        }
75        let mut elements: Vec<ContentElement> = page_chunks
76            .text_chunks
77            .into_iter()
78            .map(ContentElement::TextChunk)
79            .collect();
80
81        elements.extend(
82            page_chunks
83                .image_chunks
84                .into_iter()
85                .map(ContentElement::Image),
86        );
87        elements.extend(
88            page_chunks
89                .line_chunks
90                .into_iter()
91                .map(ContentElement::Line),
92        );
93        elements.extend(
94            page_chunks
95                .line_art_chunks
96                .into_iter()
97                .map(ContentElement::LineArt),
98        );
99        elements.extend(
100            recovered_tables
101                .into_iter()
102                .map(ContentElement::TableBorder),
103        );
104
105        page_contents.push(elements);
106    }
107
108    // Run the processing pipeline
109    let mcid_map = build_mcid_map(&raw_doc.document);
110    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
111        .with_page_info(page_info_list.clone());
112    run_pipeline(&mut pipeline_state)?;
113
114    // Build the output document
115    let file_name = input_path
116        .file_name()
117        .and_then(|n| n.to_str())
118        .unwrap_or("unknown.pdf")
119        .to_string();
120
121    let mut doc = PdfDocument::new(file_name);
122    doc.source_path = Some(input_path.display().to_string());
123    doc.number_of_pages = pages_map.len() as u32;
124    doc.author = raw_doc.metadata.author;
125    doc.title = raw_doc.metadata.title;
126    doc.creation_date = raw_doc.metadata.creation_date;
127    doc.modification_date = raw_doc.metadata.modification_date;
128
129    if config.raster_table_ocr {
130        for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
131            if let Some(page_info) = page_info_list.get(page_idx) {
132                recover_page_raster_table_cell_text(
133                    input_path,
134                    &page_info.crop_box,
135                    page_info.page_number,
136                    page,
137                );
138            }
139        }
140    }
141
142    // Flatten pipeline output into document kids
143    for page in pipeline_state.pages {
144        doc.kids.extend(page);
145    }
146
147    Ok(doc)
148}
149
150/// Convert a PDF from an in-memory byte slice to structured data.
151///
152/// This is the WASM-compatible entry point. It replaces all filesystem
153/// operations with in-memory equivalents and skips raster table OCR.
154///
155/// # Arguments
156/// * `data` — raw PDF bytes (e.g., from a `Uint8Array` in JavaScript)
157/// * `file_name` — display name (used in `PdfDocument.file_name`)
158/// * `config` — processing configuration
159///
160/// # Returns
161/// Structured document or error.
162///
163/// # Errors
164/// Returns an error if the PDF cannot be parsed or processed.
165pub fn convert_bytes(
166    data: &[u8],
167    file_name: &str,
168    config: &ProcessingConfig,
169) -> Result<PdfDocument, EdgePdfError> {
170    let raw_doc = pdf::loader::load_pdf_from_bytes(data, config.password.as_deref())?;
171
172    let page_info_list = page_info::extract_page_info(&raw_doc.document);
173
174    let pages_map = raw_doc.document.get_pages();
175    let mut page_contents = Vec::with_capacity(pages_map.len());
176
177    for (&page_num, &page_id) in &pages_map {
178        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
179
180        // Raster table OCR requires external pdfimages binary — skip in memory-only mode
181        let recovered_tables = Vec::new();
182
183        let mut elements: Vec<ContentElement> = page_chunks
184            .text_chunks
185            .into_iter()
186            .map(ContentElement::TextChunk)
187            .collect();
188
189        elements.extend(
190            page_chunks
191                .image_chunks
192                .into_iter()
193                .map(ContentElement::Image),
194        );
195        elements.extend(
196            page_chunks
197                .line_chunks
198                .into_iter()
199                .map(ContentElement::Line),
200        );
201        elements.extend(
202            page_chunks
203                .line_art_chunks
204                .into_iter()
205                .map(ContentElement::LineArt),
206        );
207        elements.extend(
208            recovered_tables
209                .into_iter()
210                .map(ContentElement::TableBorder),
211        );
212
213        page_contents.push(elements);
214    }
215
216    let mcid_map = build_mcid_map(&raw_doc.document);
217    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
218        .with_page_info(page_info_list);
219    run_pipeline(&mut pipeline_state)?;
220
221    let mut doc = PdfDocument::new(file_name.to_string());
222    doc.number_of_pages = pages_map.len() as u32;
223    doc.author = raw_doc.metadata.author;
224    doc.title = raw_doc.metadata.title;
225    doc.creation_date = raw_doc.metadata.creation_date;
226    doc.modification_date = raw_doc.metadata.modification_date;
227
228    for page in pipeline_state.pages {
229        doc.kids.extend(page);
230    }
231
232    Ok(doc)
233}
234
/// Top-level error type for EdgeParse operations.
///
/// Returned by the conversion entry points (`convert`, `convert_bytes`)
/// and carried through the processing pipeline via `?`.
#[derive(Debug, thiserror::Error)]
pub enum EdgePdfError {
    /// PDF loading error (e.g. file unreadable or not a valid PDF).
    #[error("PDF loading error: {0}")]
    LoadError(String),

    /// Pipeline processing error, tagged with the stage that failed.
    #[error("Pipeline error at stage {stage}: {message}")]
    PipelineError {
        /// Pipeline stage number (1-20)
        stage: u32,
        /// Error description
        message: String,
    },

    /// Output generation error
    #[error("Output error: {0}")]
    OutputError(String),

    /// I/O error, converted automatically from `std::io::Error` via `#[from]`.
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),

    /// Configuration error
    #[error("Configuration error: {0}")]
    ConfigError(String),

    /// lopdf error, stored as its string form (see `From<lopdf::Error>` below)
    /// so the dependency's error type is not exposed in this enum.
    #[error("PDF parse error: {0}")]
    LopdfError(String),
}
267
268impl From<lopdf::Error> for EdgePdfError {
269    fn from(e: lopdf::Error) -> Self {
270        EdgePdfError::LopdfError(e.to_string())
271    }
272}
273
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{
        content::{Content, Operation},
        dictionary, Object, Stream,
    };
    use std::io::Write;

    /// Create a synthetic PDF file for integration testing.
    ///
    /// Builds a minimal single-page PDF (595x842 pt MediaBox, Helvetica)
    /// containing two lines of text drawn with `Tj`, then saves it to `path`.
    fn create_test_pdf_file(path: &std::path::Path) {
        let mut doc = lopdf::Document::with_version("1.5");
        // Reserve the Pages node id up front so the Page object can
        // reference its parent before the Pages dictionary exists.
        let pages_id = doc.new_object_id();

        // Standard (non-embedded) Type1 Helvetica font.
        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });

        // Resource dictionary exposing the font under the name /F1.
        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });

        // Content stream: one BT/ET text block with two show-text operations.
        let content = Content {
            operations: vec![
                Operation::new("BT", vec![]),
                // Select /F1 at 12 pt.
                Operation::new("Tf", vec!["F1".into(), 12.into()]),
                // Position the first line near the top-left of the page.
                Operation::new("Td", vec![72.into(), 700.into()]),
                Operation::new("Tj", vec![Object::string_literal("Hello EdgeParse!")]),
                // Relative move down 20 units for the second line.
                Operation::new("Td", vec![0.into(), Object::Real(-20.0)]),
                Operation::new("Tj", vec![Object::string_literal("Second line of text.")]),
                Operation::new("ET", vec![]),
            ],
        };

        let encoded = content.encode().unwrap();
        let content_id = doc.add_object(Stream::new(dictionary! {}, encoded));

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        // Insert the Pages dictionary at the id reserved earlier.
        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        // Wire the document catalog into the trailer so the file is loadable.
        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let mut file = std::fs::File::create(path).unwrap();
        doc.save_to(&mut file).unwrap();
        file.flush().unwrap();
    }

    /// End-to-end check: `convert` on a synthetic PDF yields a one-page
    /// document whose extracted text contains both test strings.
    #[test]
    fn test_convert_end_to_end() {
        let dir = std::env::temp_dir().join("edgeparse_test");
        std::fs::create_dir_all(&dir).unwrap();
        let pdf_path = dir.join("test_convert.pdf");

        create_test_pdf_file(&pdf_path);

        let config = ProcessingConfig::default();
        let result = convert(&pdf_path, &config);
        assert!(result.is_ok(), "convert() failed: {:?}", result.err());

        let doc = result.unwrap();
        assert_eq!(doc.number_of_pages, 1);
        assert!(
            !doc.kids.is_empty(),
            "Expected content elements in document"
        );

        // Check that we extracted content (may be TextChunks, TextLines, or TextBlocks after pipeline)
        let mut all_text = String::new();
        for element in &doc.kids {
            match element {
                models::content::ContentElement::TextChunk(tc) => {
                    all_text.push_str(&tc.value);
                    all_text.push(' ');
                }
                models::content::ContentElement::TextLine(tl) => {
                    all_text.push_str(&tl.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::TextBlock(tb) => {
                    all_text.push_str(&tb.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Paragraph(p) => {
                    all_text.push_str(&p.base.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Heading(h) => {
                    all_text.push_str(&h.base.base.value());
                    all_text.push(' ');
                }
                // Non-text elements (images, lines, etc.) carry no text.
                _ => {}
            }
        }

        assert!(
            all_text.contains("Hello"),
            "Expected 'Hello' in extracted text, got: {}",
            all_text
        );
        assert!(
            all_text.contains("Second"),
            "Expected 'Second' in extracted text, got: {}",
            all_text
        );

        // Cleanup (best-effort; ignore failure so the test result is unaffected)
        let _ = std::fs::remove_file(&pdf_path);
    }
}