// edgeparse_core/lib.rs
1//! EdgeParse Core Library
2//!
3//! High-performance PDF-to-structured-data extraction engine.
4//! Implements a 20-stage processing pipeline for extracting text, tables,
5//! images, and semantic structure from PDF documents.
6
7#![warn(missing_docs)]
8
9pub mod api;
10pub mod models;
11pub mod output;
12pub mod pdf;
13pub mod pipeline;
14pub mod utils;
15
16#[cfg(feature = "hybrid")]
17pub mod hybrid;
18
19pub mod tagged;
20
21use crate::api::config::ProcessingConfig;
22use crate::models::content::ContentElement;
23use crate::models::document::PdfDocument;
24use crate::pdf::chunk_parser::extract_page_chunks;
25use crate::pdf::page_info;
26#[cfg(not(target_arch = "wasm32"))]
27use crate::pdf::raster_table_ocr::recover_raster_table_borders;
28use crate::pipeline::orchestrator::{run_pipeline, PipelineState};
29use crate::tagged::struct_tree::build_mcid_map;
30
31/// Main entry point: convert a PDF file to structured data.
32///
33/// # Arguments
34/// * `input_path` - Path to the input PDF file
35/// * `config` - Processing configuration
36///
37/// # Returns
38/// * `Result<PdfDocument>` - The extracted structured document
39///
40/// # Errors
41/// Returns an error if the PDF cannot be loaded or processed.
42#[cfg(not(target_arch = "wasm32"))]
43pub fn convert(
44    input_path: &std::path::Path,
45    config: &ProcessingConfig,
46) -> Result<PdfDocument, EdgePdfError> {
47    let raw_doc = pdf::loader::load_pdf(input_path, config.password.as_deref())?;
48
49    // Extract per-page geometry (MediaBox, CropBox, rotation) for use throughout the pipeline.
50    let page_info_list = page_info::extract_page_info(&raw_doc.document);
51
52    // Extract text chunks from each page
53    let pages_map = raw_doc.document.get_pages();
54    let mut page_contents = Vec::with_capacity(pages_map.len());
55
56    for (&page_num, &page_id) in &pages_map {
57        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
58        let mut recovered_tables = Vec::new();
59        if let Some(page_info) = page_info_list
60            .iter()
61            .find(|info| info.page_number == page_num)
62        {
63            recovered_tables = recover_raster_table_borders(
64                input_path,
65                &page_info.crop_box,
66                page_num,
67                &page_chunks.text_chunks,
68                &page_chunks.image_chunks,
69            );
70        }
71        let mut elements: Vec<ContentElement> = page_chunks
72            .text_chunks
73            .into_iter()
74            .map(ContentElement::TextChunk)
75            .collect();
76
77        elements.extend(
78            page_chunks
79                .image_chunks
80                .into_iter()
81                .map(ContentElement::Image),
82        );
83        elements.extend(
84            page_chunks
85                .line_chunks
86                .into_iter()
87                .map(ContentElement::Line),
88        );
89        elements.extend(
90            page_chunks
91                .line_art_chunks
92                .into_iter()
93                .map(ContentElement::LineArt),
94        );
95        elements.extend(
96            recovered_tables
97                .into_iter()
98                .map(ContentElement::TableBorder),
99        );
100
101        page_contents.push(elements);
102    }
103
104    // Run the processing pipeline
105    let mcid_map = build_mcid_map(&raw_doc.document);
106    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
107        .with_page_info(page_info_list);
108    run_pipeline(&mut pipeline_state)?;
109
110    // Build the output document
111    let file_name = input_path
112        .file_name()
113        .and_then(|n| n.to_str())
114        .unwrap_or("unknown.pdf")
115        .to_string();
116
117    let mut doc = PdfDocument::new(file_name);
118    doc.number_of_pages = pages_map.len() as u32;
119    doc.author = raw_doc.metadata.author;
120    doc.title = raw_doc.metadata.title;
121    doc.creation_date = raw_doc.metadata.creation_date;
122    doc.modification_date = raw_doc.metadata.modification_date;
123
124    // Flatten pipeline output into document kids
125    for page in pipeline_state.pages {
126        doc.kids.extend(page);
127    }
128
129    Ok(doc)
130}
131
132/// Convert a PDF from an in-memory byte slice to structured data.
133///
134/// This is the WASM-compatible entry point. It replaces all filesystem
135/// operations with in-memory equivalents and skips raster table OCR.
136///
137/// # Arguments
138/// * `data` — raw PDF bytes (e.g., from a `Uint8Array` in JavaScript)
139/// * `file_name` — display name (used in `PdfDocument.file_name`)
140/// * `config` — processing configuration
141///
142/// # Returns
143/// Structured document or error.
144///
145/// # Errors
146/// Returns an error if the PDF cannot be parsed or processed.
147pub fn convert_bytes(
148    data: &[u8],
149    file_name: &str,
150    config: &ProcessingConfig,
151) -> Result<PdfDocument, EdgePdfError> {
152    let raw_doc = pdf::loader::load_pdf_from_bytes(data, config.password.as_deref())?;
153
154    let page_info_list = page_info::extract_page_info(&raw_doc.document);
155
156    let pages_map = raw_doc.document.get_pages();
157    let mut page_contents = Vec::with_capacity(pages_map.len());
158
159    for (&page_num, &page_id) in &pages_map {
160        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
161
162        // Raster table OCR requires external pdfimages binary — skip in memory-only mode
163        let recovered_tables = Vec::new();
164
165        let mut elements: Vec<ContentElement> = page_chunks
166            .text_chunks
167            .into_iter()
168            .map(ContentElement::TextChunk)
169            .collect();
170
171        elements.extend(
172            page_chunks
173                .image_chunks
174                .into_iter()
175                .map(ContentElement::Image),
176        );
177        elements.extend(
178            page_chunks
179                .line_chunks
180                .into_iter()
181                .map(ContentElement::Line),
182        );
183        elements.extend(
184            page_chunks
185                .line_art_chunks
186                .into_iter()
187                .map(ContentElement::LineArt),
188        );
189        elements.extend(
190            recovered_tables
191                .into_iter()
192                .map(ContentElement::TableBorder),
193        );
194
195        page_contents.push(elements);
196    }
197
198    let mcid_map = build_mcid_map(&raw_doc.document);
199    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
200        .with_page_info(page_info_list);
201    run_pipeline(&mut pipeline_state)?;
202
203    let mut doc = PdfDocument::new(file_name.to_string());
204    doc.number_of_pages = pages_map.len() as u32;
205    doc.author = raw_doc.metadata.author;
206    doc.title = raw_doc.metadata.title;
207    doc.creation_date = raw_doc.metadata.creation_date;
208    doc.modification_date = raw_doc.metadata.modification_date;
209
210    for page in pipeline_state.pages {
211        doc.kids.extend(page);
212    }
213
214    Ok(doc)
215}
216
/// Top-level error type for EdgeParse operations.
///
/// All fallible public entry points (`convert`, `convert_bytes`) return this
/// type. Display formatting is derived via `thiserror`.
#[derive(Debug, thiserror::Error)]
pub enum EdgePdfError {
    /// The PDF could not be loaded (bad path, wrong password, corrupt file).
    #[error("PDF loading error: {0}")]
    LoadError(String),

    /// A stage of the processing pipeline failed.
    #[error("Pipeline error at stage {stage}: {message}")]
    PipelineError {
        /// Pipeline stage number (1-20)
        stage: u32,
        /// Error description
        message: String,
    },

    /// Serializing or writing the structured output failed.
    #[error("Output error: {0}")]
    OutputError(String),

    /// I/O error, converted automatically from `std::io::Error`.
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),

    /// Invalid or inconsistent `ProcessingConfig`.
    #[error("Configuration error: {0}")]
    ConfigError(String),

    /// Low-level parse error from the `lopdf` crate (see `From<lopdf::Error>`).
    #[error("PDF parse error: {0}")]
    LopdfError(String),
}
249
250impl From<lopdf::Error> for EdgePdfError {
251    fn from(e: lopdf::Error) -> Self {
252        EdgePdfError::LopdfError(e.to_string())
253    }
254}
255
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{
        content::{Content, Operation},
        dictionary, Object, Stream,
    };
    use std::io::Write;

    /// Create a synthetic single-page A4 PDF (Helvetica, two `Tj` text lines:
    /// "Hello EdgeParse!" and "Second line of text.") for integration testing.
    fn create_test_pdf_file(path: &std::path::Path) {
        let mut doc = lopdf::Document::with_version("1.5");
        // Reserve the Pages id up front so the page can reference its parent.
        let pages_id = doc.new_object_id();

        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });

        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });

        // Two text-showing operations positioned 20pt apart vertically.
        let content = Content {
            operations: vec![
                Operation::new("BT", vec![]),
                Operation::new("Tf", vec!["F1".into(), 12.into()]),
                Operation::new("Td", vec![72.into(), 700.into()]),
                Operation::new("Tj", vec![Object::string_literal("Hello EdgeParse!")]),
                Operation::new("Td", vec![0.into(), Object::Real(-20.0)]),
                Operation::new("Tj", vec![Object::string_literal("Second line of text.")]),
                Operation::new("ET", vec![]),
            ],
        };

        let encoded = content.encode().unwrap();
        let content_id = doc.add_object(Stream::new(dictionary! {}, encoded));

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        // Insert the Pages node at the id reserved above.
        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let mut file = std::fs::File::create(path).unwrap();
        doc.save_to(&mut file).unwrap();
        file.flush().unwrap();
    }

    #[test]
    fn test_convert_end_to_end() {
        let dir = std::env::temp_dir().join("edgeparse_test");
        std::fs::create_dir_all(&dir).unwrap();
        // Process-unique file name so concurrent test runs (or stale files
        // left by an aborted run) in the shared temp dir cannot collide.
        let pdf_path = dir.join(format!("test_convert_{}.pdf", std::process::id()));

        create_test_pdf_file(&pdf_path);

        let config = ProcessingConfig::default();
        let result = convert(&pdf_path, &config);
        assert!(result.is_ok(), "convert() failed: {:?}", result.err());

        let doc = result.unwrap();
        assert_eq!(doc.number_of_pages, 1);
        assert!(
            !doc.kids.is_empty(),
            "Expected content elements in document"
        );

        // Gather text from every text-bearing element kind. The pipeline may
        // have merged TextChunks into TextLines/TextBlocks/Paragraphs/Headings,
        // so accept any of them.
        let mut all_text = String::new();
        for element in &doc.kids {
            match element {
                models::content::ContentElement::TextChunk(tc) => {
                    all_text.push_str(&tc.value);
                    all_text.push(' ');
                }
                models::content::ContentElement::TextLine(tl) => {
                    all_text.push_str(&tl.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::TextBlock(tb) => {
                    all_text.push_str(&tb.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Paragraph(p) => {
                    all_text.push_str(&p.base.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Heading(h) => {
                    all_text.push_str(&h.base.base.value());
                    all_text.push(' ');
                }
                _ => {}
            }
        }

        assert!(
            all_text.contains("Hello"),
            "Expected 'Hello' in extracted text, got: {}",
            all_text
        );
        assert!(
            all_text.contains("Second"),
            "Expected 'Second' in extracted text, got: {}",
            all_text
        );

        // Cleanup (best-effort; only reached when the assertions pass).
        let _ = std::fs::remove_file(&pdf_path);
    }
}