1#![warn(missing_docs)]
8
/// Public API surface, including the [`ProcessingConfig`] used to drive conversion.
pub mod api;
/// Data models: [`PdfDocument`] and the [`ContentElement`] variants extracted from pages.
pub mod models;
/// Output handling for converted documents.
pub mod output;
/// Low-level PDF access: loading, page info, and per-page chunk extraction.
pub mod pdf;
/// The processing pipeline that structures raw page content (see `run_pipeline`).
pub mod pipeline;
/// Shared utility helpers.
pub mod utils;

/// Hybrid processing mode (only compiled with the `hybrid` feature).
#[cfg(feature = "hybrid")]
pub mod hybrid;

/// Tagged-PDF support: structure tree traversal and MCID mapping.
pub mod tagged;
20
21use crate::api::config::ProcessingConfig;
22use crate::models::content::ContentElement;
23use crate::models::document::PdfDocument;
24use crate::pdf::chunk_parser::extract_page_chunks;
25use crate::pdf::page_info;
26#[cfg(not(target_arch = "wasm32"))]
27use crate::pdf::raster_table_ocr::recover_raster_table_borders;
28use crate::pipeline::orchestrator::{run_pipeline, PipelineState};
29use crate::tagged::struct_tree::build_mcid_map;
30
31#[cfg(not(target_arch = "wasm32"))]
43pub fn convert(
44 input_path: &std::path::Path,
45 config: &ProcessingConfig,
46) -> Result<PdfDocument, EdgePdfError> {
47 let raw_doc = pdf::loader::load_pdf(input_path, config.password.as_deref())?;
48
49 let page_info_list = page_info::extract_page_info(&raw_doc.document);
51
52 let pages_map = raw_doc.document.get_pages();
54 let mut page_contents = Vec::with_capacity(pages_map.len());
55
56 for (&page_num, &page_id) in &pages_map {
57 let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
58 let mut recovered_tables = Vec::new();
59 if let Some(page_info) = page_info_list
60 .iter()
61 .find(|info| info.page_number == page_num)
62 {
63 recovered_tables = recover_raster_table_borders(
64 input_path,
65 &page_info.crop_box,
66 page_num,
67 &page_chunks.text_chunks,
68 &page_chunks.image_chunks,
69 );
70 }
71 let mut elements: Vec<ContentElement> = page_chunks
72 .text_chunks
73 .into_iter()
74 .map(ContentElement::TextChunk)
75 .collect();
76
77 elements.extend(
78 page_chunks
79 .image_chunks
80 .into_iter()
81 .map(ContentElement::Image),
82 );
83 elements.extend(
84 page_chunks
85 .line_chunks
86 .into_iter()
87 .map(ContentElement::Line),
88 );
89 elements.extend(
90 page_chunks
91 .line_art_chunks
92 .into_iter()
93 .map(ContentElement::LineArt),
94 );
95 elements.extend(
96 recovered_tables
97 .into_iter()
98 .map(ContentElement::TableBorder),
99 );
100
101 page_contents.push(elements);
102 }
103
104 let mcid_map = build_mcid_map(&raw_doc.document);
106 let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
107 .with_page_info(page_info_list);
108 run_pipeline(&mut pipeline_state)?;
109
110 let file_name = input_path
112 .file_name()
113 .and_then(|n| n.to_str())
114 .unwrap_or("unknown.pdf")
115 .to_string();
116
117 let mut doc = PdfDocument::new(file_name);
118 doc.number_of_pages = pages_map.len() as u32;
119 doc.author = raw_doc.metadata.author;
120 doc.title = raw_doc.metadata.title;
121 doc.creation_date = raw_doc.metadata.creation_date;
122 doc.modification_date = raw_doc.metadata.modification_date;
123
124 for page in pipeline_state.pages {
126 doc.kids.extend(page);
127 }
128
129 Ok(doc)
130}
131
132pub fn convert_bytes(
148 data: &[u8],
149 file_name: &str,
150 config: &ProcessingConfig,
151) -> Result<PdfDocument, EdgePdfError> {
152 let raw_doc = pdf::loader::load_pdf_from_bytes(data, config.password.as_deref())?;
153
154 let page_info_list = page_info::extract_page_info(&raw_doc.document);
155
156 let pages_map = raw_doc.document.get_pages();
157 let mut page_contents = Vec::with_capacity(pages_map.len());
158
159 for (&page_num, &page_id) in &pages_map {
160 let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
161
162 let recovered_tables = Vec::new();
164
165 let mut elements: Vec<ContentElement> = page_chunks
166 .text_chunks
167 .into_iter()
168 .map(ContentElement::TextChunk)
169 .collect();
170
171 elements.extend(
172 page_chunks
173 .image_chunks
174 .into_iter()
175 .map(ContentElement::Image),
176 );
177 elements.extend(
178 page_chunks
179 .line_chunks
180 .into_iter()
181 .map(ContentElement::Line),
182 );
183 elements.extend(
184 page_chunks
185 .line_art_chunks
186 .into_iter()
187 .map(ContentElement::LineArt),
188 );
189 elements.extend(
190 recovered_tables
191 .into_iter()
192 .map(ContentElement::TableBorder),
193 );
194
195 page_contents.push(elements);
196 }
197
198 let mcid_map = build_mcid_map(&raw_doc.document);
199 let mut pipeline_state = PipelineState::with_mcid_map(page_contents, config.clone(), mcid_map)
200 .with_page_info(page_info_list);
201 run_pipeline(&mut pipeline_state)?;
202
203 let mut doc = PdfDocument::new(file_name.to_string());
204 doc.number_of_pages = pages_map.len() as u32;
205 doc.author = raw_doc.metadata.author;
206 doc.title = raw_doc.metadata.title;
207 doc.creation_date = raw_doc.metadata.creation_date;
208 doc.modification_date = raw_doc.metadata.modification_date;
209
210 for page in pipeline_state.pages {
211 doc.kids.extend(page);
212 }
213
214 Ok(doc)
215}
216
/// Errors that can occur while converting a PDF document.
#[derive(Debug, thiserror::Error)]
pub enum EdgePdfError {
    /// The PDF file or byte stream could not be loaded.
    #[error("PDF loading error: {0}")]
    LoadError(String),

    /// A stage of the processing pipeline failed.
    #[error("Pipeline error at stage {stage}: {message}")]
    PipelineError {
        /// Index of the pipeline stage that failed.
        stage: u32,
        /// Description of the failure.
        message: String,
    },

    /// An error occurred while producing output.
    #[error("Output error: {0}")]
    OutputError(String),

    /// An underlying I/O error.
    #[error("I/O error: {0}")]
    IoError(#[from] std::io::Error),

    /// The supplied configuration was invalid.
    #[error("Configuration error: {0}")]
    ConfigError(String),

    /// A parse error reported by the `lopdf` backend (see the `From` impl below).
    #[error("PDF parse error: {0}")]
    LopdfError(String),
}
249
250impl From<lopdf::Error> for EdgePdfError {
251 fn from(e: lopdf::Error) -> Self {
252 EdgePdfError::LopdfError(e.to_string())
253 }
254}
255
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{
        content::{Content, Operation},
        dictionary, Object, Stream,
    };
    use std::io::Write;

    /// Writes a minimal single-page PDF (two lines of Helvetica text) to
    /// `path`, for exercising `convert()` end to end.
    fn create_test_pdf_file(path: &std::path::Path) {
        let mut doc = lopdf::Document::with_version("1.5");
        // Reserve the Pages object id up front so the page can reference its
        // parent before the Pages dictionary itself is inserted (below).
        let pages_id = doc.new_object_id();

        let font_id = doc.add_object(dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        });

        // Resources dictionary mapping the name "F1" to the font above.
        let resources_id = doc.add_object(dictionary! {
            "Font" => dictionary! {
                "F1" => font_id,
            },
        });

        // Content stream: begin text, select F1 at 12pt, position at (72, 700),
        // show "Hello EdgeParse!", move down 20 units, show the second line, end.
        let content = Content {
            operations: vec![
                Operation::new("BT", vec![]),
                Operation::new("Tf", vec!["F1".into(), 12.into()]),
                Operation::new("Td", vec![72.into(), 700.into()]),
                Operation::new("Tj", vec![Object::string_literal("Hello EdgeParse!")]),
                Operation::new("Td", vec![0.into(), Object::Real(-20.0)]),
                Operation::new("Tj", vec![Object::string_literal("Second line of text.")]),
                Operation::new("ET", vec![]),
            ],
        };

        let encoded = content.encode().unwrap();
        let content_id = doc.add_object(Stream::new(dictionary! {}, encoded));

        // A4-sized page (595x842 points) wiring together the parent, content
        // stream, and resources created above.
        let page_id = doc.add_object(dictionary! {
            "Type" => "Page",
            "Parent" => pages_id,
            "Contents" => content_id,
            "Resources" => resources_id,
            "MediaBox" => vec![0.into(), 0.into(), 595.into(), 842.into()],
        });

        // Now insert the Pages dictionary under the id reserved earlier.
        let pages = dictionary! {
            "Type" => "Pages",
            "Kids" => vec![page_id.into()],
            "Count" => 1,
        };
        doc.objects.insert(pages_id, Object::Dictionary(pages));

        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Pages" => pages_id,
        });
        doc.trailer.set("Root", catalog_id);

        let mut file = std::fs::File::create(path).unwrap();
        doc.save_to(&mut file).unwrap();
        // Flush explicitly so the file is fully on disk before convert() reads it.
        file.flush().unwrap();
    }

    /// End-to-end: build a PDF on disk, run `convert()`, and check the page
    /// count and that both text lines survive extraction.
    #[test]
    fn test_convert_end_to_end() {
        let dir = std::env::temp_dir().join("edgeparse_test");
        std::fs::create_dir_all(&dir).unwrap();
        let pdf_path = dir.join("test_convert.pdf");

        create_test_pdf_file(&pdf_path);

        let config = ProcessingConfig::default();
        let result = convert(&pdf_path, &config);
        assert!(result.is_ok(), "convert() failed: {:?}", result.err());

        let doc = result.unwrap();
        assert_eq!(doc.number_of_pages, 1);
        assert!(
            !doc.kids.is_empty(),
            "Expected content elements in document"
        );

        // Gather text from every text-bearing element variant: the pipeline may
        // have merged chunks into lines/blocks/paragraphs/headings.
        let mut all_text = String::new();
        for element in &doc.kids {
            match element {
                models::content::ContentElement::TextChunk(tc) => {
                    all_text.push_str(&tc.value);
                    all_text.push(' ');
                }
                models::content::ContentElement::TextLine(tl) => {
                    all_text.push_str(&tl.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::TextBlock(tb) => {
                    all_text.push_str(&tb.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Paragraph(p) => {
                    all_text.push_str(&p.base.value());
                    all_text.push(' ');
                }
                models::content::ContentElement::Heading(h) => {
                    all_text.push_str(&h.base.base.value());
                    all_text.push(' ');
                }
                _ => {}
            }
        }

        assert!(
            all_text.contains("Hello"),
            "Expected 'Hello' in extracted text, got: {}",
            all_text
        );
        assert!(
            all_text.contains("Second"),
            "Expected 'Second' in extracted text, got: {}",
            all_text
        );

        // Best-effort cleanup; ignore errors so a failed delete doesn't fail the test.
        let _ = std::fs::remove_file(&pdf_path);
    }
}