// edgeparse_core/lib.rs
1//! EdgeParse Core Library
2//!
3//! High-performance PDF-to-structured-data extraction engine.
4//! Implements a 20-stage processing pipeline for extracting text, tables,
5//! images, and semantic structure from PDF documents.
6
7#![warn(missing_docs)]
8
9pub mod api;
10pub mod models;
11pub mod output;
12pub mod pdf;
13pub mod pipeline;
14pub mod utils;
15
16#[cfg(feature = "hybrid")]
17pub mod hybrid;
18
19pub mod tagged;
20
21use crate::api::config::ProcessingConfig;
22use crate::models::content::ContentElement;
23use crate::models::document::PdfDocument;
24use crate::pdf::chunk_parser::extract_page_chunks;
25use crate::pdf::page_info;
26use crate::pdf::raster_table_ocr::recover_raster_table_borders;
27use crate::pipeline::orchestrator::{run_pipeline, PipelineState};
28use crate::tagged::struct_tree::build_mcid_map;
29
30/// Main entry point: convert a PDF file to structured data.
31///
32/// # Arguments
33/// * `input_path` - Path to the input PDF file
34/// * `config` - Processing configuration
35///
36/// # Returns
37/// * `Result<PdfDocument>` - The extracted structured document
38///
39/// # Errors
40/// Returns an error if the PDF cannot be loaded or processed.
41pub fn convert(
42    input_path: &std::path::Path,
43    config: &ProcessingConfig,
44) -> Result<PdfDocument, EdgePdfError> {
45    let raw_doc = pdf::loader::load_pdf(input_path, config.password.as_deref())?;
46
47    // Extract per-page geometry (MediaBox, CropBox, rotation) for use throughout the pipeline.
48    let page_info_list = page_info::extract_page_info(&raw_doc.document);
49
50    // Extract text chunks from each page
51    let pages_map = raw_doc.document.get_pages();
52    let mut page_contents = Vec::with_capacity(pages_map.len());
53
54    for (&page_num, &page_id) in &pages_map {
55        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
56        let mut recovered_tables = Vec::new();
57        if let Some(page_info) = page_info_list
58            .iter()
59            .find(|info| info.page_number == page_num)
60        {
61            recovered_tables = recover_raster_table_borders(
62                input_path,
63                &page_info.crop_box,
64                page_num,
65                &page_chunks.text_chunks,
66                &page_chunks.image_chunks,
67            );
68        }
69        let mut elements: Vec<ContentElement> = page_chunks
70            .text_chunks
71            .into_iter()
72            .map(ContentElement::TextChunk)
73            .collect();
74
75        elements.extend(
76            page_chunks
77                .image_chunks
78                .into_iter()
79                .map(ContentElement::Image),
80        );
81        elements.extend(
82            page_chunks
83                .line_chunks
84                .into_iter()
85                .map(ContentElement::Line),
86        );
87        elements.extend(
88            page_chunks
89                .line_art_chunks
90                .into_iter()
91                .map(ContentElement::LineArt),
92        );
93        elements.extend(
94            recovered_tables
95                .into_iter()
96                .map(ContentElement::TableBorder),
97        );
98
99        page_contents.push(elements);
100    }
101
102    // Run the processing pipeline
103    let mcid_map = build_mcid_map(&raw_doc.document);
104    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
105        .with_page_info(page_info_list);
106    run_pipeline(&mut pipeline_state)?;
107
108    // Build the output document
109    let file_name = input_path
110        .file_name()
111        .and_then(|n| n.to_str())
112        .unwrap_or("unknown.pdf")
113        .to_string();
114
115    let mut doc = PdfDocument::new(file_name);
116    doc.number_of_pages = pages_map.len() as u32;
117    doc.author = raw_doc.metadata.author;
118    doc.title = raw_doc.metadata.title;
119    doc.creation_date = raw_doc.metadata.creation_date;
120    doc.modification_date = raw_doc.metadata.modification_date;
121
122    // Flatten pipeline output into document kids
123    for page in pipeline_state.pages {
124        doc.kids.extend(page);
125    }
126
127    Ok(doc)
128}
129
/// Top-level error type for EdgeParse operations.
///
/// Display messages are generated by `thiserror`'s `#[error(...)]` attributes.
#[derive(Debug, thiserror::Error)]
pub enum EdgePdfError {
    /// PDF loading error
    #[error("PDF loading error: {0}")]
    LoadError(String),

    /// Pipeline processing error, tagged with the stage that failed
    #[error("Pipeline error at stage {stage}: {message}")]
    PipelineError {
        /// Pipeline stage number (1-20)
        stage: u32,
        /// Error description
        message: String,
    },

    /// Output generation error
    #[error("Output error: {0}")]
    OutputError(String),

    /// I/O error (auto-converted from `std::io::Error` via `#[from]`)
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),

    /// Configuration error
    #[error("Configuration error: {0}")]
    ConfigError(String),

    /// lopdf error, stored as its rendered message
    #[error("PDF parse error: {0}")]
    LopdfError(String),
}
162
163impl From<lopdf::Error> for EdgePdfError {
164    fn from(e: lopdf::Error) -> Self {
165        EdgePdfError::LopdfError(e.to_string())
166    }
167}
168
169#[cfg(test)]
170mod tests {
171    use super::*;
172    use lopdf::{
173        content::{Content, Operation},
174        dictionary, Object, Stream,
175    };
176    use std::io::Write;
177
178    /// Create a synthetic PDF file for integration testing.
179    fn create_test_pdf_file(path: &std::path::Path) {
180        let mut doc = lopdf::Document::with_version("1.5");
181        let pages_id = doc.new_object_id();
182
183        let font_id = doc.add_object(dictionary! {
184            "Type" => "Font",
185            "Subtype" => "Type1",
186            "BaseFont" => "Helvetica",
187        });
188
189        let resources_id = doc.add_object(dictionary! {
190            "Font" => dictionary! {
191                "F1" => font_id,
192            },
193        });
194
195        let content = Content {
196            operations: vec![
197                Operation::new("BT", vec![]),
198                Operation::new("Tf", vec!["F1".into(), 12.into()]),
199                Operation::new("Td", vec![72.into(), 700.into()]),
200                Operation::new("Tj", vec![Object::string_literal("Hello EdgeParse!")]),
201                Operation::new("Td", vec![0.into(), Object::Real(-20.0)]),
202                Operation::new("Tj", vec![Object::string_literal("Second line of text.")]),
203                Operation::new("ET", vec![]),
204            ],
205        };
206
207        let encoded = content.encode().unwrap();
208        let content_id = doc.add_object(Stream::new(dictionary! {}, encoded));
209
210        let page_id = doc.add_object(dictionary! {
211            "Type" => "Page",
212            "Parent" => pages_id,
213            "Contents" => content_id,
214            "Resources" => resources_id,
215            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
216        });
217
218        let pages = dictionary! {
219            "Type" => "Pages",
220            "Kids" => vec![page_id.into()],
221            "Count" => 1,
222        };
223        doc.objects.insert(pages_id, Object::Dictionary(pages));
224
225        let catalog_id = doc.add_object(dictionary! {
226            "Type" => "Catalog",
227            "Pages" => pages_id,
228        });
229        doc.trailer.set("Root", catalog_id);
230
231        let mut file = std::fs::File::create(path).unwrap();
232        doc.save_to(&mut file).unwrap();
233        file.flush().unwrap();
234    }
235
236    #[test]
237    fn test_convert_end_to_end() {
238        let dir = std::env::temp_dir().join("edgeparse_test");
239        std::fs::create_dir_all(&dir).unwrap();
240        let pdf_path = dir.join("test_convert.pdf");
241
242        create_test_pdf_file(&pdf_path);
243
244        let config = ProcessingConfig::default();
245        let result = convert(&pdf_path, &config);
246        assert!(result.is_ok(), "convert() failed: {:?}", result.err());
247
248        let doc = result.unwrap();
249        assert_eq!(doc.number_of_pages, 1);
250        assert!(
251            !doc.kids.is_empty(),
252            "Expected content elements in document"
253        );
254
255        // Check that we extracted content (may be TextChunks, TextLines, or TextBlocks after pipeline)
256        let mut all_text = String::new();
257        for element in &doc.kids {
258            match element {
259                models::content::ContentElement::TextChunk(tc) => {
260                    all_text.push_str(&tc.value);
261                    all_text.push(' ');
262                }
263                models::content::ContentElement::TextLine(tl) => {
264                    all_text.push_str(&tl.value());
265                    all_text.push(' ');
266                }
267                models::content::ContentElement::TextBlock(tb) => {
268                    all_text.push_str(&tb.value());
269                    all_text.push(' ');
270                }
271                models::content::ContentElement::Paragraph(p) => {
272                    all_text.push_str(&p.base.value());
273                    all_text.push(' ');
274                }
275                models::content::ContentElement::Heading(h) => {
276                    all_text.push_str(&h.base.base.value());
277                    all_text.push(' ');
278                }
279                _ => {}
280            }
281        }
282
283        assert!(
284            all_text.contains("Hello"),
285            "Expected 'Hello' in extracted text, got: {}",
286            all_text
287        );
288        assert!(
289            all_text.contains("Second"),
290            "Expected 'Second' in extracted text, got: {}",
291            all_text
292        );
293
294        // Cleanup
295        let _ = std::fs::remove_file(&pdf_path);
296    }
297}