1pub mod archive;
2pub mod csv;
3pub mod engine;
4pub mod error;
5pub mod format;
6pub mod image;
7pub mod ir;
8pub mod json;
9pub mod openxml;
10pub mod pdf;
11pub mod render;
12pub mod source;
13pub mod textual;
14
15use std::collections::{HashMap, HashSet};
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::process::Command;
19use std::time::{SystemTime, UNIX_EPOCH};
20
21pub use archive::ArchiveEngine;
22pub use csv::CsvEngine;
23pub use engine::{ExtractionEngine, PlainTextEngine};
24pub use error::{DonglerError, Result};
25pub use format::{ExtractionStatus, InputFormat};
26pub use image::ImageEngine;
27pub use ir::{
28 Asset, BBox, BatchResult, Block, BlockKind, Confidence, Document, ExtractOptions, FigureBlock,
29 ImageObject, Line, Metadata, Page, Provenance, Route, SourceAnchor, Span, TableBlock, TableCell,
30 TextBlock, TextSource, Warning,
31};
32pub use json::JsonEngine;
33pub use openxml::OpenXmlEngine;
34pub use pdf::PdfEngine;
35pub use render::{JsonRenderer, LatexRenderer, MarkdownRenderer, Renderer};
36pub use source::{
37 FormatSourceLoader, ImageSourceLoader, PdfSourceLoader, Source, SourceLoader, TextSourceLoader,
38};
39pub use textual::{EmailEngine, HtmlEngine, XmlEngine};
40
41impl Document {
42 pub fn to_markdown(&self) -> Result<String> {
43 MarkdownRenderer.render(self)
44 }
45
46 pub fn to_json(&self) -> Result<String> {
47 JsonRenderer.render(self)
48 }
49
50 pub fn to_latex(&self) -> Result<String> {
51 LatexRenderer.render(self)
52 }
53}
54
55pub fn parse_text(text: &str) -> Result<Document> {
56 PlainTextEngine.extract(&Source::from_text(text))
57}
58
59pub fn load_path(path: impl AsRef<Path>) -> Result<Document> {
60 load_path_with_options(path, ExtractOptions::default())
61}
62
63pub fn load_path_with_options(path: impl AsRef<Path>, options: ExtractOptions) -> Result<Document> {
64 let path = path.as_ref();
65 let format = InputFormat::detect_path(path)?;
66 if format.extraction_status() == ExtractionStatus::Planned {
67 return Err(DonglerError::planned_format(format.as_str()));
68 }
69
70 let source = load_source(format, path)?;
71 let mut document = engine_extract(format, &source)?;
72
73 if ocr_fallback_enabled() {
74 apply_ocr_fallback(&mut document);
75 }
76 apply_extract_options(&mut document, &options);
77 Ok(document)
78}
79
80fn load_source(format: InputFormat, path: &Path) -> Result<Source> {
82 match format {
83 InputFormat::Text => TextSourceLoader.load(path),
84 InputFormat::Pdf => PdfSourceLoader.load(path),
85 InputFormat::Image => ImageSourceLoader.load(path),
86 _ => FormatSourceLoader::new(format).load(path),
87 }
88}
89
90fn engine_extract(format: InputFormat, source: &Source) -> Result<Document> {
95 match format {
96 InputFormat::Text => PlainTextEngine.extract(source),
97 InputFormat::Pdf => PdfEngine.extract(source),
98 InputFormat::Image => ImageEngine.extract(source),
99 InputFormat::Archive => ArchiveEngine.extract(source),
100 InputFormat::Word
101 | InputFormat::Excel
102 | InputFormat::Presentation
103 | InputFormat::OpenDocument => OpenXmlEngine.extract(source),
104 InputFormat::Html => HtmlEngine.extract(source),
105 InputFormat::Email => EmailEngine.extract(source),
106 InputFormat::Xml => XmlEngine.extract(source),
107 InputFormat::Json => JsonEngine.extract(source),
108 InputFormat::Csv => CsvEngine.extract(source),
109 InputFormat::LegacyWord
110 | InputFormat::LegacyExcel
111 | InputFormat::LegacyPresentation
112 | InputFormat::LegacyEmail => Err(DonglerError::planned_format(format.as_str())),
113 }
114}
115
116pub fn extract_bytes(bytes: &[u8], filename: &str) -> Result<Document> {
123 extract_bytes_with_options(bytes, filename, ExtractOptions::default())
124}
125
126pub fn extract_bytes_with_options(
127 bytes: &[u8],
128 filename: &str,
129 options: ExtractOptions,
130) -> Result<Document> {
131 let format = InputFormat::detect_path(filename)?;
132 if format.extraction_status() == ExtractionStatus::Planned {
133 return Err(DonglerError::planned_format(format.as_str()));
134 }
135
136 let source = Source::from_bytes_for_format(bytes, filename, format)?;
137 let mut document = engine_extract(format, &source)?;
138 apply_extract_options(&mut document, &options);
139 Ok(document)
140}
141
/// Settings for the external-tool OCR fallback.
#[derive(Clone, Debug)]
struct OcrFallbackConfig {
    /// Program invoked to rasterize a single PDF page.
    renderer: String,
    /// Program invoked to run OCR on the rasterized page.
    engine: String,
    /// Directory in which intermediate page images are written.
    temp_dir: PathBuf,
}
148
/// Reports whether the `DONGLER_OCR_FALLBACK` environment variable holds one
/// of the accepted "on" values (`1`, `true`, `yes`, `on`, ASCII
/// case-insensitive).
///
/// An unset or non-Unicode variable counts as disabled. The value is not
/// trimmed, so surrounding whitespace disables the switch.
fn ocr_fallback_enabled() -> bool {
    const ACCEPTED: [&str; 4] = ["1", "true", "yes", "on"];

    match std::env::var("DONGLER_OCR_FALLBACK") {
        Ok(value) => ACCEPTED
            .iter()
            .any(|candidate| value.eq_ignore_ascii_case(candidate)),
        Err(_) => false,
    }
}
158
159fn apply_ocr_fallback(document: &mut Document) {
160 if document.metadata.format != "pdf" {
161 return;
162 }
163 let Some(source_path) = document.metadata.source.as_deref().map(PathBuf::from) else {
164 return;
165 };
166 if !source_path.exists() {
167 return;
168 }
169 let config = ocr_fallback_config();
170 let mut changed = false;
171
172 for page in &mut document.pages {
173 if !page_needs_ocr_fallback(page) {
174 continue;
175 }
176
177 match ocr_pdf_page(&source_path, page.number, &config) {
178 Ok(Some(text)) => {
179 insert_ocr_text_block(page, text);
180 changed = true;
181 }
182 Ok(None) => {}
183 Err(message) => page.warnings.push(Warning {
184 code: "ocr.fallback".to_owned(),
185 severity: "warning".to_owned(),
186 message,
187 source_anchor: Some(SourceAnchor {
188 page_number: page.number,
189 pdf_object_ids: Vec::new(),
190 bbox: page.bbox,
191 extraction_method: "ocr_fallback".to_owned(),
192 }),
193 }),
194 }
195 }
196
197 if changed {
198 refresh_document_counts(document);
199 }
200}
201
202fn ocr_fallback_config() -> OcrFallbackConfig {
203 OcrFallbackConfig {
204 renderer: std::env::var("DONGLER_PDF_RENDERER").unwrap_or_else(|_| "pdftoppm".to_owned()),
205 engine: std::env::var("DONGLER_OCR_ENGINE").unwrap_or_else(|_| "tesseract".to_owned()),
206 temp_dir: std::env::var("DONGLER_OCR_TEMP_DIR")
207 .map(PathBuf::from)
208 .unwrap_or_else(|_| {
209 std::env::current_dir()
210 .unwrap_or_else(|_| std::env::temp_dir())
211 .join("target")
212 .join("dongler-ocr")
213 }),
214 }
215}
216
217fn page_needs_ocr_fallback(page: &Page) -> bool {
218 !page.images.is_empty()
219 && !page.blocks.iter().any(|block| match block {
220 Block::Text(text) => !text.text.trim().is_empty(),
221 Block::Table(table) => {
222 table.headers.iter().any(|value| !value.trim().is_empty())
223 || table
224 .rows
225 .iter()
226 .flatten()
227 .any(|value| !value.trim().is_empty())
228 }
229 Block::Figure(_) => false,
230 })
231}
232
233fn ocr_pdf_page(
234 source_path: &Path,
235 page_number: usize,
236 config: &OcrFallbackConfig,
237) -> std::result::Result<Option<String>, String> {
238 fs::create_dir_all(&config.temp_dir).map_err(|error| {
239 format!(
240 "could not create OCR temp dir {}: {error}",
241 config.temp_dir.display()
242 )
243 })?;
244 let prefix = config.temp_dir.join(format!(
245 "page-{}-{}-{}",
246 std::process::id(),
247 page_number,
248 SystemTime::now()
249 .duration_since(UNIX_EPOCH)
250 .map(|duration| duration.as_nanos())
251 .unwrap_or_default()
252 ));
253 let image_path = prefix.with_extension("png");
254 let page = page_number.to_string();
255 let render_output = Command::new(&config.renderer)
256 .args([
257 "-f",
258 page.as_str(),
259 "-l",
260 page.as_str(),
261 "-r",
262 "200",
263 "-png",
264 "-singlefile",
265 ])
266 .arg(source_path)
267 .arg(&prefix)
268 .output()
269 .map_err(|error| format!("could not run PDF renderer {}: {error}", config.renderer))?;
270
271 if !render_output.status.success() {
272 let stderr = String::from_utf8_lossy(&render_output.stderr);
273 return Err(format!(
274 "PDF renderer {} failed: {}",
275 config.renderer,
276 stderr.trim()
277 ));
278 }
279
280 let ocr_output = Command::new(&config.engine)
281 .arg(&image_path)
282 .arg("stdout")
283 .args(["--psm", "6"])
284 .output()
285 .map_err(|error| format!("could not run OCR engine {}: {error}", config.engine));
286 let _ = fs::remove_file(&image_path);
287
288 let ocr_output = ocr_output?;
289 if !ocr_output.status.success() {
290 let stderr = String::from_utf8_lossy(&ocr_output.stderr);
291 return Err(format!(
292 "OCR engine {} failed: {}",
293 config.engine,
294 stderr.trim()
295 ));
296 }
297
298 let text = normalize_ocr_text(&String::from_utf8_lossy(&ocr_output.stdout));
299 Ok((!text.is_empty()).then_some(text))
300}
301
/// Collapses every run of whitespace inside a line to a single space, drops
/// lines that end up empty, and joins the remaining lines with `\n`.
fn normalize_ocr_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    for raw_line in text.lines() {
        let mut words = raw_line.split_whitespace();
        // A line without any word is blank and contributes nothing.
        let Some(first_word) = words.next() else {
            continue;
        };

        if !normalized.is_empty() {
            normalized.push('\n');
        }
        normalized.push_str(first_word);
        for word in words {
            normalized.push(' ');
            normalized.push_str(word);
        }
    }

    normalized
}
309
310fn insert_ocr_text_block(page: &mut Page, text: String) {
311 let bbox = page.bbox;
312 page.blocks.insert(
313 0,
314 Block::Text(TextBlock {
315 text: text.clone(),
316 kind: "ocr_text".to_owned(),
317 bbox,
318 lines: vec![Line {
319 text: text.clone(),
320 bbox,
321 spans: vec![Span {
322 text,
323 bbox,
324 font: None,
325 size: None,
326 bold: false,
327 italic: false,
328 }],
329 }],
330 source_anchors: vec![SourceAnchor {
331 page_number: page.number,
332 pdf_object_ids: Vec::new(),
333 bbox,
334 extraction_method: "ocr_fallback".to_owned(),
335 }],
336 confidence: Some(Confidence {
337 score: 0.55,
338 calibrated: false,
339 }), ..Default::default()
340 }),
341 );
342}
343
344fn apply_extract_options(document: &mut Document, options: &ExtractOptions) {
345 if options.suppress_headers_footers {
346 suppress_repeated_headers_footers(document);
347 }
348
349 if !options.include_geometry {
350 for page in &mut document.pages {
351 page.bbox = None;
352 page.width = None;
353 page.height = None;
354 for block in &mut page.blocks {
355 match block {
356 Block::Text(text) => {
357 text.bbox = None;
358 text.lines.clear();
359 for anchor in &mut text.source_anchors {
360 anchor.bbox = None;
361 }
362 }
363 Block::Table(table) => {
364 table.bbox = None;
365 for cell in &mut table.cells {
366 cell.bbox = None;
367 }
368 for anchor in &mut table.source_anchors {
369 anchor.bbox = None;
370 }
371 }
372 Block::Figure(figure) => {
373 figure.bbox = None;
374 for anchor in &mut figure.source_anchors {
375 anchor.bbox = None;
376 }
377 }
378 }
379 }
380 for image in &mut page.images {
381 image.bbox = None;
382 }
383 for asset in &mut page.assets {
384 asset.bbox = None;
385 }
386 }
387 }
388
389 if !options.include_assets {
390 document.assets.clear();
391 for page in &mut document.pages {
392 page.assets.clear();
393 page.images.clear();
394 }
395 }
396}
397
398fn suppress_repeated_headers_footers(document: &mut Document) {
399 if document.pages.len() < 2 {
400 return;
401 }
402
403 let mut occurrences = HashMap::new();
404 for page in &document.pages {
405 let mut seen_on_page = HashSet::new();
406 for block in &page.blocks {
407 if let Some(key) = header_footer_key(page.height, block) {
408 seen_on_page.insert(key);
409 }
410 }
411 for key in seen_on_page {
412 *occurrences.entry(key).or_insert(0usize) += 1;
413 }
414 }
415
416 let minimum_pages = 2.max((document.pages.len() + 1) / 2);
417 let repeated = occurrences
418 .into_iter()
419 .filter_map(|(key, count)| (count >= minimum_pages).then_some(key))
420 .collect::<HashSet<_>>();
421 if repeated.is_empty() {
422 return;
423 }
424
425 for page in &mut document.pages {
426 let page_height = page.height;
427 page.blocks.retain(|block| {
428 header_footer_key(page_height, block)
429 .map(|key| !repeated.contains(&key))
430 .unwrap_or(true)
431 });
432 }
433 refresh_document_counts(document);
434}
435
436fn header_footer_key(page_height: Option<f32>, block: &Block) -> Option<String> {
437 let height = page_height?;
438 if height <= 0.0 {
439 return None;
440 }
441
442 let bbox = block_bbox(block)?;
443 let center_y = bbox.y + bbox.height / 2.0;
444 let margin = (height * 0.12).max(48.0);
445 let band = if center_y >= height - margin {
446 "top"
447 } else if center_y <= margin {
448 "bottom"
449 } else {
450 return None;
451 };
452
453 let text = normalize_repeated_margin_text(&block_text(block));
454 (!text.is_empty()).then(|| format!("{band}:{text}"))
455}
456
457fn block_bbox(block: &Block) -> Option<BBox> {
458 match block {
459 Block::Text(text) => text.bbox,
460 Block::Table(table) => table.bbox,
461 Block::Figure(figure) => figure.bbox,
462 }
463}
464
465fn normalize_repeated_margin_text(text: &str) -> String {
466 let mut output = String::new();
467 let mut last_was_space = true;
468 for character in text.chars().flat_map(char::to_lowercase) {
469 if character.is_ascii_digit() {
470 if !output.ends_with('#') {
471 output.push('#');
472 }
473 last_was_space = false;
474 } else if character.is_whitespace() {
475 if !last_was_space {
476 output.push(' ');
477 last_was_space = true;
478 }
479 } else {
480 output.push(character);
481 last_was_space = false;
482 }
483 }
484 output.trim().to_owned()
485}
486
487fn refresh_document_counts(document: &mut Document) {
488 let mut character_count = 0;
489 let mut word_count = 0;
490 let mut block_count = 0;
491
492 for page in &document.pages {
493 for block in &page.blocks {
494 let text = block_text(block);
495 character_count += text.chars().count();
496 word_count += text.split_whitespace().count();
497 block_count += 1;
498 }
499 }
500
501 document.metadata.character_count = character_count;
502 document.metadata.word_count = word_count;
503 document.metadata.block_count = block_count;
504}
505
506fn block_text(block: &Block) -> String {
507 match block {
508 Block::Text(text) => text.text.clone(),
509 Block::Table(table) => {
510 let mut rows = Vec::new();
511 if !table.headers.is_empty() {
512 rows.push(table.headers.join(" "));
513 }
514 rows.extend(table.rows.iter().map(|row| row.join(" ")));
515 rows.join("\n")
516 }
517 Block::Figure(figure) => figure.caption.clone().unwrap_or_default(),
518 }
519}
520
521pub fn load_many<I, P>(paths: I) -> Vec<BatchResult>
522where
523 I: IntoIterator<Item = P>,
524 P: AsRef<Path>,
525{
526 paths
527 .into_iter()
528 .map(|path| {
529 let path = path.as_ref();
530 let path_string = path.display().to_string();
531
532 match load_path(path) {
533 Ok(document) => BatchResult {
534 path: path_string,
535 ok: true,
536 document: Some(document),
537 error: None,
538 },
539 Err(error) => BatchResult {
540 path: path_string,
541 ok: false,
542 document: None,
543 error: Some(error.to_string()),
544 },
545 }
546 })
547 .collect()
548}
549
550pub fn to_markdown(text: &str) -> Result<String> {
551 let document = parse_text(text)?;
552 document.to_markdown()
553}
554
555pub fn to_json(text: &str) -> Result<String> {
556 let document = parse_text(text)?;
557 document.to_json()
558}
559
560pub fn to_latex(text: &str) -> Result<String> {
561 let document = parse_text(text)?;
562 document.to_latex()
563}
564
565pub fn detect_format(path: &str) -> Result<String> {
566 Ok(InputFormat::detect_path(path)?.as_str().to_owned())
567}