1#![warn(missing_docs)]
8
9pub mod api;
10pub mod models;
11pub mod output;
12pub mod pdf;
13pub mod pipeline;
14pub mod utils;
15
16#[cfg(feature = "hybrid")]
17pub mod hybrid;
18
19pub mod tagged;
20
21use crate::api::config::ProcessingConfig;
22use crate::models::content::ContentElement;
23use crate::models::document::PdfDocument;
24use crate::pdf::chunk_parser::extract_page_chunks;
25use crate::pdf::page_info;
26#[cfg(not(target_arch = "wasm32"))]
27use crate::pdf::raster_table_ocr::{
28 recover_page_raster_table_cell_text, recover_raster_table_borders,
29};
30use crate::pipeline::orchestrator::{run_pipeline, PipelineState};
31use crate::tagged::struct_tree::build_mcid_map;
32
/// Convert the PDF file at `input_path` into a structured [`PdfDocument`].
///
/// Processing steps:
/// 1. Load (and, if `config.password` is set, decrypt) the PDF from disk.
/// 2. Extract per-page geometry info and raw content chunks (text, images,
///    lines, line art).
/// 3. Recover table borders from rasterized tables; this step takes
///    `input_path` so it can work from the original file.
/// 4. Run the layout pipeline over the flattened per-page element lists.
/// 5. Recover cell text for raster tables on the pipeline output, then
///    assemble the final document with metadata copied from the source PDF.
///
/// # Errors
///
/// Returns an [`EdgePdfError`] if the file cannot be loaded or parsed, if
/// page content extraction fails, or if the pipeline reports an error.
#[cfg(not(target_arch = "wasm32"))]
pub fn convert(
    input_path: &std::path::Path,
    config: &ProcessingConfig,
) -> Result<PdfDocument, EdgePdfError> {
    let raw_doc = pdf::loader::load_pdf(input_path, config.password.as_deref())?;

    let page_info_list = page_info::extract_page_info(&raw_doc.document);

    let pages_map = raw_doc.document.get_pages();
    let mut page_contents = Vec::with_capacity(pages_map.len());

    for (&page_num, &page_id) in &pages_map {
        let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
        // Border recovery needs this page's crop box; skip it quietly if no
        // matching page info was extracted.
        let mut recovered_tables = Vec::new();
        if let Some(page_info) = page_info_list
            .iter()
            .find(|info| info.page_number == page_num)
        {
            recovered_tables = recover_raster_table_borders(
                input_path,
                &page_info.crop_box,
                page_num,
                &page_chunks.text_chunks,
                &page_chunks.image_chunks,
            );
        }
        // Flatten every chunk kind into a single element list for this page;
        // the pipeline operates on one Vec<ContentElement> per page.
        let mut elements: Vec<ContentElement> = page_chunks
            .text_chunks
            .into_iter()
            .map(ContentElement::TextChunk)
            .collect();

        elements.extend(
            page_chunks
                .image_chunks
                .into_iter()
                .map(ContentElement::Image),
        );
        elements.extend(
            page_chunks
                .line_chunks
                .into_iter()
                .map(ContentElement::Line),
        );
        elements.extend(
            page_chunks
                .line_art_chunks
                .into_iter()
                .map(ContentElement::LineArt),
        );
        elements.extend(
            recovered_tables
                .into_iter()
                .map(ContentElement::TableBorder),
        );

        page_contents.push(elements);
    }

    let mcid_map = build_mcid_map(&raw_doc.document);
    let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
        .with_page_info(page_info_list.clone());
    run_pipeline(&mut pipeline_state)?;

    // Fall back to a placeholder name if the path has no UTF-8 file name.
    let file_name = input_path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("unknown.pdf")
        .to_string();

    let mut doc = PdfDocument::new(file_name);
    doc.source_path = Some(input_path.display().to_string());
    doc.number_of_pages = pages_map.len() as u32;
    doc.author = raw_doc.metadata.author;
    doc.title = raw_doc.metadata.title;
    doc.creation_date = raw_doc.metadata.creation_date;
    doc.modification_date = raw_doc.metadata.modification_date;

    // Second raster-table pass: recover cell text on the post-pipeline pages.
    // NOTE(review): this indexes `page_info_list` by position, while the loop
    // above matched by `page_number` — assumes extract_page_info returns pages
    // in the same order as get_pages(); confirm.
    for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
        if let Some(page_info) = page_info_list.get(page_idx) {
            recover_page_raster_table_cell_text(
                input_path,
                &page_info.crop_box,
                page_info.page_number,
                page,
            );
        }
    }

    // Flatten per-page element lists into the document's top-level children.
    for page in pipeline_state.pages {
        doc.kids.extend(page);
    }

    Ok(doc)
}
145
146pub fn convert_bytes(
162 data: &[u8],
163 file_name: &str,
164 config: &ProcessingConfig,
165) -> Result<PdfDocument, EdgePdfError> {
166 let raw_doc = pdf::loader::load_pdf_from_bytes(data, config.password.as_deref())?;
167
168 let page_info_list = page_info::extract_page_info(&raw_doc.document);
169
170 let pages_map = raw_doc.document.get_pages();
171 let mut page_contents = Vec::with_capacity(pages_map.len());
172
173 for (&page_num, &page_id) in &pages_map {
174 let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
175
176 let recovered_tables = Vec::new();
178
179 let mut elements: Vec<ContentElement> = page_chunks
180 .text_chunks
181 .into_iter()
182 .map(ContentElement::TextChunk)
183 .collect();
184
185 elements.extend(
186 page_chunks
187 .image_chunks
188 .into_iter()
189 .map(ContentElement::Image),
190 );
191 elements.extend(
192 page_chunks
193 .line_chunks
194 .into_iter()
195 .map(ContentElement::Line),
196 );
197 elements.extend(
198 page_chunks
199 .line_art_chunks
200 .into_iter()
201 .map(ContentElement::LineArt),
202 );
203 elements.extend(
204 recovered_tables
205 .into_iter()
206 .map(ContentElement::TableBorder),
207 );
208
209 page_contents.push(elements);
210 }
211
212 let mcid_map = build_mcid_map(&raw_doc.document);
213 let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
214 .with_page_info(page_info_list);
215 run_pipeline(&mut pipeline_state)?;
216
217 let mut doc = PdfDocument::new(file_name.to_string());
218 doc.number_of_pages = pages_map.len() as u32;
219 doc.author = raw_doc.metadata.author;
220 doc.title = raw_doc.metadata.title;
221 doc.creation_date = raw_doc.metadata.creation_date;
222 doc.modification_date = raw_doc.metadata.modification_date;
223
224 for page in pipeline_state.pages {
225 doc.kids.extend(page);
226 }
227
228 Ok(doc)
229}
230
/// Errors that can occur while converting a PDF with this crate.
#[derive(Debug, thiserror::Error)]
pub enum EdgePdfError {
    /// The PDF file or byte stream could not be loaded.
    #[error("PDF loading error: {0}")]
    LoadError(String),

    /// A processing stage of the layout pipeline failed.
    #[error("Pipeline error at stage {stage}: {message}")]
    PipelineError {
        /// Index of the pipeline stage that failed.
        stage: u32,
        /// Human-readable description of the failure.
        message: String,
    },

    /// Serializing or writing the converted document failed.
    #[error("Output error: {0}")]
    OutputError(String),

    /// An underlying I/O operation failed (converted from [`std::io::Error`]).
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),

    /// The supplied [`ProcessingConfig`] was invalid.
    #[error("Configuration error: {0}")]
    ConfigError(String),

    /// The low-level PDF parser (`lopdf`) reported an error.
    #[error("PDF parse error: {0}")]
    LopdfError(String),
}
263
264impl From<lopdf::Error> for EdgePdfError {
265 fn from(e: lopdf::Error) -> Self {
266 EdgePdfError::LopdfError(e.to_string())
267 }
268}
269
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{
        content::{Content, Operation},
        dictionary, Object, Stream,
    };
    use std::io::Write;

    /// Write a minimal single-page PDF to `path`: one Helvetica font, one
    /// content stream drawing two `Tj` text lines, built by hand with `lopdf`.
    fn create_test_pdf_file(path: &std::path::Path) {
        let mut doc = lopdf::Document::with_version("1.5");
        // Reserve the Pages node id up front so the Page object can name its
        // parent before the Pages dictionary itself is inserted.
        let pages_id = doc.new_object_id();

        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });

        // Page resources: expose the font under the name /F1 used by Tf below.
        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });

        // BT..ET text block: select /F1 at 12pt, position near the top-left,
        // show a line, move down 20 units, show a second line.
        let content = Content {
            operations: vec![
                Operation::new("BT", vec![]),
                Operation::new("Tf", vec!["F1".into(), 12.into()]),
                Operation::new("Td", vec![72.into(), 700.into()]),
                Operation::new("Tj", vec![Object::string_literal("Hello EdgeParse!")]),
                Operation::new("Td", vec![0.into(), Object::Real(-20.0)]),
                Operation::new("Tj", vec![Object::string_literal("Second line of text.")]),
                Operation::new("ET", vec![]),
            ],
        };

        let encoded = content.encode().unwrap();
        let content_id = doc.add_object(Stream::new(dictionary! {}, encoded));

        // A4 portrait media box (595 x 842 points).
        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        // Now fill in the previously reserved Pages node with its one kid.
        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let mut file = std::fs::File::create(path).unwrap();
        doc.save_to(&mut file).unwrap();
        // Flush explicitly so convert() below sees the complete file on disk.
        file.flush().unwrap();
    }

    /// End-to-end smoke test: write a real PDF to a temp file, run `convert`,
    /// and check the page count plus that both text lines survive extraction.
    #[test]
    fn test_convert_end_to_end() {
        let dir = std::env::temp_dir().join("edgeparse_test");
        std::fs::create_dir_all(&dir).unwrap();
        let pdf_path = dir.join("test_convert.pdf");

        create_test_pdf_file(&pdf_path);

        let config = ProcessingConfig::default();
        let result = convert(&pdf_path, &config);
        assert!(result.is_ok(), "convert() failed: {:?}", result.err());

        let doc = result.unwrap();
        assert_eq!(doc.number_of_pages, 1);
        assert!(
            !doc.kids.is_empty(),
            "Expected content elements in document"
        );

        // The pipeline may leave text at any aggregation level (chunk, line,
        // block, paragraph, heading), so gather text from every text-bearing
        // variant before asserting on content.
        let mut all_text = String::new();
        for element in &doc.kids {
            match element {
                models::content::ContentElement::TextChunk(tc) => {
                    all_text.push_str(&tc.value);
                    all_text.push(' ');
                }
                models::content::ContentElement::TextLine(tl) => {
                    all_text.push_str(&tl.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::TextBlock(tb) => {
                    all_text.push_str(&tb.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Paragraph(p) => {
                    all_text.push_str(&p.base.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Heading(h) => {
                    all_text.push_str(&h.base.base.value());
                    all_text.push(' ');
                }
                _ => {}
            }
        }

        assert!(
            all_text.contains("Hello"),
            "Expected 'Hello' in extracted text, got: {}",
            all_text
        );
        assert!(
            all_text.contains("Second"),
            "Expected 'Second' in extracted text, got: {}",
            all_text
        );

        // Best-effort cleanup; failure to delete the temp file is not an error.
        let _ = std::fs::remove_file(&pdf_path);
    }
}