// edgeparse_core — src/lib.rs
1//! EdgeParse Core Library
2//!
3//! High-performance PDF-to-structured-data extraction engine.
4//! Implements a 20-stage processing pipeline for extracting text, tables,
5//! images, and semantic structure from PDF documents.
6
7#![warn(missing_docs)]
8
9pub mod api;
10pub mod models;
11pub mod output;
12pub mod pdf;
13pub mod pipeline;
14pub mod utils;
15
16#[cfg(feature = "hybrid")]
17pub mod hybrid;
18
19pub mod tagged;
20
21use crate::api::config::ProcessingConfig;
22use crate::models::content::ContentElement;
23use crate::models::document::PdfDocument;
24use crate::pdf::chunk_parser::extract_page_chunks;
25use crate::pdf::page_info;
26#[cfg(not(target_arch = "wasm32"))]
27use crate::pdf::raster_table_ocr::{
28    recover_page_raster_table_cell_text, recover_raster_table_borders,
29};
30use crate::pipeline::orchestrator::{run_pipeline, PipelineState};
31use crate::tagged::struct_tree::build_mcid_map;
32
/// Main entry point: convert a PDF file to structured data.
///
/// Loads the PDF from disk, extracts per-page content (text, images, lines,
/// line art, and table borders recovered from raster images), runs the
/// processing pipeline, and assembles the final [`PdfDocument`].
///
/// # Arguments
/// * `input_path` - Path to the input PDF file
/// * `config` - Processing configuration
///
/// # Returns
/// * `Result<PdfDocument>` - The extracted structured document
///
/// # Errors
/// Returns an error if the PDF cannot be loaded or processed.
#[cfg(not(target_arch = "wasm32"))]
pub fn convert(
    input_path: &std::path::Path,
    config: &ProcessingConfig,
) -> Result<PdfDocument, EdgePdfError> {
    // Load (and decrypt, when a password is configured) the raw PDF.
    let raw_doc = pdf::loader::load_pdf(input_path, config.password.as_deref())?;

    // Extract per-page geometry (MediaBox, CropBox, rotation) for use throughout the pipeline.
    let page_info_list = page_info::extract_page_info(&raw_doc.document);

    // Extract text chunks from each page
    let pages_map = raw_doc.document.get_pages();
    let mut page_contents = Vec::with_capacity(pages_map.len());

    for (&page_num, &page_id) in &pages_map {
        // Low-level per-page extraction: text, image, line, and line-art chunks.
        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;

        // Table-border recovery needs the page's crop box; pages missing from
        // `page_info_list` simply get no recovered tables.
        let mut recovered_tables = Vec::new();
        if let Some(page_info) = page_info_list
            .iter()
            .find(|info| info.page_number == page_num)
        {
            // NOTE(review): this takes `input_path`, so it presumably re-reads
            // the PDF from disk (raster_table_ocr module) — confirm.
            recovered_tables = recover_raster_table_borders(
                input_path,
                &page_info.crop_box,
                page_num,
                &page_chunks.text_chunks,
                &page_chunks.image_chunks,
            );
        }
        // Flatten every chunk kind into one element list, grouped in this
        // order: text, images, lines, line art, recovered table borders.
        let mut elements: Vec<ContentElement> = page_chunks
            .text_chunks
            .into_iter()
            .map(ContentElement::TextChunk)
            .collect();

        elements.extend(
            page_chunks
                .image_chunks
                .into_iter()
                .map(ContentElement::Image),
        );
        elements.extend(
            page_chunks
                .line_chunks
                .into_iter()
                .map(ContentElement::Line),
        );
        elements.extend(
            page_chunks
                .line_art_chunks
                .into_iter()
                .map(ContentElement::LineArt),
        );
        elements.extend(
            recovered_tables
                .into_iter()
                .map(ContentElement::TableBorder),
        );

        page_contents.push(elements);
    }

    // Run the processing pipeline
    // `mcid_map` presumably maps marked-content IDs to the PDF structure tree
    // (Tagged PDF) — built in tagged::struct_tree.
    let mcid_map = build_mcid_map(&raw_doc.document);
    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
        .with_page_info(page_info_list.clone());
    run_pipeline(&mut pipeline_state)?;

    // Build the output document
    let file_name = input_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("unknown.pdf")
        .to_string();

    let mut doc = PdfDocument::new(file_name);
    doc.source_path = Some(input_path.display().to_string());
    doc.number_of_pages = pages_map.len() as u32;
    doc.author = raw_doc.metadata.author;
    doc.title = raw_doc.metadata.title;
    doc.creation_date = raw_doc.metadata.creation_date;
    doc.modification_date = raw_doc.metadata.modification_date;

    // Post-pipeline pass: recover cell text for raster tables via OCR.
    // NOTE(review): `page_info` is looked up by *index* here, while the
    // extraction loop above matched by `page_number` — these agree only if
    // `extract_page_info` returns pages in `get_pages()` order; confirm.
    for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
        if let Some(page_info) = page_info_list.get(page_idx) {
            recover_page_raster_table_cell_text(
                input_path,
                &page_info.crop_box,
                page_info.page_number,
                page,
            );
        }
    }

    // Flatten pipeline output into document kids
    for page in pipeline_state.pages {
        doc.kids.extend(page);
    }

    Ok(doc)
}
145
146/// Convert a PDF from an in-memory byte slice to structured data.
147///
148/// This is the WASM-compatible entry point. It replaces all filesystem
149/// operations with in-memory equivalents and skips raster table OCR.
150///
151/// # Arguments
152/// * `data` — raw PDF bytes (e.g., from a `Uint8Array` in JavaScript)
153/// * `file_name` — display name (used in `PdfDocument.file_name`)
154/// * `config` — processing configuration
155///
156/// # Returns
157/// Structured document or error.
158///
159/// # Errors
160/// Returns an error if the PDF cannot be parsed or processed.
161pub fn convert_bytes(
162    data: &[u8],
163    file_name: &str,
164    config: &ProcessingConfig,
165) -> Result<PdfDocument, EdgePdfError> {
166    let raw_doc = pdf::loader::load_pdf_from_bytes(data, config.password.as_deref())?;
167
168    let page_info_list = page_info::extract_page_info(&raw_doc.document);
169
170    let pages_map = raw_doc.document.get_pages();
171    let mut page_contents = Vec::with_capacity(pages_map.len());
172
173    for (&page_num, &page_id) in &pages_map {
174        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
175
176        // Raster table OCR requires external pdfimages binary — skip in memory-only mode
177        let recovered_tables = Vec::new();
178
179        let mut elements: Vec<ContentElement> = page_chunks
180            .text_chunks
181            .into_iter()
182            .map(ContentElement::TextChunk)
183            .collect();
184
185        elements.extend(
186            page_chunks
187                .image_chunks
188                .into_iter()
189                .map(ContentElement::Image),
190        );
191        elements.extend(
192            page_chunks
193                .line_chunks
194                .into_iter()
195                .map(ContentElement::Line),
196        );
197        elements.extend(
198            page_chunks
199                .line_art_chunks
200                .into_iter()
201                .map(ContentElement::LineArt),
202        );
203        elements.extend(
204            recovered_tables
205                .into_iter()
206                .map(ContentElement::TableBorder),
207        );
208
209        page_contents.push(elements);
210    }
211
212    let mcid_map = build_mcid_map(&raw_doc.document);
213    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
214        .with_page_info(page_info_list);
215    run_pipeline(&mut pipeline_state)?;
216
217    let mut doc = PdfDocument::new(file_name.to_string());
218    doc.number_of_pages = pages_map.len() as u32;
219    doc.author = raw_doc.metadata.author;
220    doc.title = raw_doc.metadata.title;
221    doc.creation_date = raw_doc.metadata.creation_date;
222    doc.modification_date = raw_doc.metadata.modification_date;
223
224    for page in pipeline_state.pages {
225        doc.kids.extend(page);
226    }
227
228    Ok(doc)
229}
230
/// Top-level error type for EdgeParse operations.
///
/// All fallible public APIs in this crate return this type. Lower-level
/// errors are converted into it: `std::io::Error` via the derived `#[from]`
/// on [`EdgePdfError::IoError`], and `lopdf::Error` via the `From` impl
/// below (stringified into [`EdgePdfError::LopdfError`]).
#[derive(Debug, thiserror::Error)]
pub enum EdgePdfError {
    /// PDF loading error
    #[error("PDF loading error: {0}")]
    LoadError(String),

    /// Pipeline processing error, tagged with the stage that failed.
    #[error("Pipeline error at stage {stage}: {message}")]
    PipelineError {
        /// Pipeline stage number (1-20)
        stage: u32,
        /// Error description
        message: String,
    },

    /// Output generation error
    #[error("Output error: {0}")]
    OutputError(String),

    /// I/O error
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),

    /// Configuration error
    #[error("Configuration error: {0}")]
    ConfigError(String),

    /// lopdf error (kept as a string so `lopdf` types don't leak into the API)
    #[error("PDF parse error: {0}")]
    LopdfError(String),
}
263
264impl From<lopdf::Error> for EdgePdfError {
265    fn from(e: lopdf::Error) -> Self {
266        EdgePdfError::LopdfError(e.to_string())
267    }
268}
269
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{
        content::{Content, Operation},
        dictionary, Object, Stream,
    };
    use std::io::Write;

    /// Write a minimal single-page A4 PDF (two lines of Helvetica text)
    /// to `path`, for end-to-end conversion testing.
    fn create_test_pdf_file(path: &std::path::Path) {
        let mut doc = lopdf::Document::with_version("1.5");

        // Reserve the page-tree id up front so the page can reference it.
        let pages_id = doc.new_object_id();

        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });
        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });

        // Two text runs: one at (72, 700), the second 20pt lower.
        let operations = vec![
            Operation::new("BT", vec![]),
            Operation::new("Tf", vec!["F1".into(), 12.into()]),
            Operation::new("Td", vec![72.into(), 700.into()]),
            Operation::new("Tj", vec![Object::string_literal("Hello EdgeParse!")]),
            Operation::new("Td", vec![0.into(), Object::Real(-20.0)]),
            Operation::new("Tj", vec![Object::string_literal("Second line of text.")]),
            Operation::new("ET", vec![]),
        ];
        let encoded = Content { operations }.encode().unwrap();
        let content_id = doc.add_object(Stream::new(dictionary! {}, encoded));

        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        // Fill in the reserved page-tree object now that the kid id is known.
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => vec![page_id.into()],
                "Count" => 1,
            }),
        );

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let mut file = std::fs::File::create(path).unwrap();
        doc.save_to(&mut file).unwrap();
        file.flush().unwrap();
    }

    /// Append the text carried by `element` (if any) to `buf`, followed by
    /// a single space separator.
    fn append_text(element: &models::content::ContentElement, buf: &mut String) {
        use models::content::ContentElement as E;
        match element {
            E::TextChunk(tc) => buf.push_str(&tc.value),
            E::TextLine(tl) => buf.push_str(&tl.value()),
            E::TextBlock(tb) => buf.push_str(&tb.value()),
            E::Paragraph(p) => buf.push_str(&p.base.value()),
            E::Heading(h) => buf.push_str(&h.base.base.value()),
            _ => return,
        }
        buf.push(' ');
    }

    #[test]
    fn test_convert_end_to_end() {
        let work_dir = std::env::temp_dir().join("edgeparse_test");
        std::fs::create_dir_all(&work_dir).unwrap();
        let pdf_path = work_dir.join("test_convert.pdf");
        create_test_pdf_file(&pdf_path);

        let config = ProcessingConfig::default();
        let result = convert(&pdf_path, &config);
        assert!(result.is_ok(), "convert() failed: {:?}", result.err());

        let doc = result.unwrap();
        assert_eq!(doc.number_of_pages, 1);
        assert!(
            !doc.kids.is_empty(),
            "Expected content elements in document"
        );

        // Gather text from whatever element kinds the pipeline produced —
        // chunks may have been merged into lines, blocks, paragraphs, or
        // headings by later stages.
        let mut all_text = String::new();
        for element in &doc.kids {
            append_text(element, &mut all_text);
        }

        assert!(
            all_text.contains("Hello"),
            "Expected 'Hello' in extracted text, got: {}",
            all_text
        );
        assert!(
            all_text.contains("Second"),
            "Expected 'Second' in extracted text, got: {}",
            all_text
        );

        // Cleanup
        let _ = std::fs::remove_file(&pdf_path);
    }
}