Skip to main content

dongler_core/
lib.rs

1pub mod archive;
2pub mod csv;
3pub mod engine;
4pub mod error;
5pub mod format;
6pub mod image;
7pub mod ir;
8pub mod json;
9pub mod openxml;
10pub mod pdf;
11pub mod render;
12pub mod source;
13pub mod textual;
14
15use std::collections::{HashMap, HashSet};
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::process::Command;
19use std::time::{SystemTime, UNIX_EPOCH};
20
21pub use archive::ArchiveEngine;
22pub use csv::CsvEngine;
23pub use engine::{ExtractionEngine, PlainTextEngine};
24pub use error::{DonglerError, Result};
25pub use format::{ExtractionStatus, InputFormat};
26pub use image::ImageEngine;
27pub use ir::{
28    Asset, BBox, BatchResult, Block, BlockKind, Confidence, Document, ExtractOptions, FigureBlock,
29    ImageObject, Line, Metadata, Page, Provenance, Route, SourceAnchor, Span, TableBlock, TableCell,
30    TextBlock, TextSource, Warning,
31};
32pub use json::JsonEngine;
33pub use openxml::OpenXmlEngine;
34pub use pdf::PdfEngine;
35pub use render::{JsonRenderer, LatexRenderer, MarkdownRenderer, Renderer};
36pub use source::{
37    FormatSourceLoader, ImageSourceLoader, PdfSourceLoader, Source, SourceLoader, TextSourceLoader,
38};
39pub use textual::{EmailEngine, HtmlEngine, XmlEngine};
40
41impl Document {
42    pub fn to_markdown(&self) -> Result<String> {
43        MarkdownRenderer.render(self)
44    }
45
46    pub fn to_json(&self) -> Result<String> {
47        JsonRenderer.render(self)
48    }
49
50    pub fn to_latex(&self) -> Result<String> {
51        LatexRenderer.render(self)
52    }
53}
54
55pub fn parse_text(text: &str) -> Result<Document> {
56    PlainTextEngine.extract(&Source::from_text(text))
57}
58
59pub fn load_path(path: impl AsRef<Path>) -> Result<Document> {
60    load_path_with_options(path, ExtractOptions::default())
61}
62
63pub fn load_path_with_options(path: impl AsRef<Path>, options: ExtractOptions) -> Result<Document> {
64    let path = path.as_ref();
65    let format = InputFormat::detect_path(path)?;
66    if format.extraction_status() == ExtractionStatus::Planned {
67        return Err(DonglerError::planned_format(format.as_str()));
68    }
69
70    let source = load_source(format, path)?;
71    let mut document = engine_extract(format, &source)?;
72
73    if ocr_fallback_enabled() {
74        apply_ocr_fallback(&mut document);
75    }
76    apply_extract_options(&mut document, &options);
77    Ok(document)
78}
79
80/// Read a source from disk using the loader appropriate for `format`.
81fn load_source(format: InputFormat, path: &Path) -> Result<Source> {
82    match format {
83        InputFormat::Text => TextSourceLoader.load(path),
84        InputFormat::Pdf => PdfSourceLoader.load(path),
85        InputFormat::Image => ImageSourceLoader.load(path),
86        _ => FormatSourceLoader::new(format).load(path),
87    }
88}
89
90/// Dispatch an in-memory source to the engine for `format`.
91///
92/// This is the filesystem-free heart of extraction, shared by the path-based
93/// loaders and the byte-based [`extract_bytes`] entry point used by wasm.
94fn engine_extract(format: InputFormat, source: &Source) -> Result<Document> {
95    match format {
96        InputFormat::Text => PlainTextEngine.extract(source),
97        InputFormat::Pdf => PdfEngine.extract(source),
98        InputFormat::Image => ImageEngine.extract(source),
99        InputFormat::Archive => ArchiveEngine.extract(source),
100        InputFormat::Word
101        | InputFormat::Excel
102        | InputFormat::Presentation
103        | InputFormat::OpenDocument => OpenXmlEngine.extract(source),
104        InputFormat::Html => HtmlEngine.extract(source),
105        InputFormat::Email => EmailEngine.extract(source),
106        InputFormat::Xml => XmlEngine.extract(source),
107        InputFormat::Json => JsonEngine.extract(source),
108        InputFormat::Csv => CsvEngine.extract(source),
109        InputFormat::LegacyWord
110        | InputFormat::LegacyExcel
111        | InputFormat::LegacyPresentation
112        | InputFormat::LegacyEmail => Err(DonglerError::planned_format(format.as_str())),
113    }
114}
115
116/// Extract a document from in-memory bytes, detecting the format from
117/// `filename` (its extension only — the file is never read from disk).
118///
119/// This is the primary entry point for environments without a filesystem,
120/// such as WebAssembly. OCR fallback is intentionally not applied here because
121/// it relies on spawning external processes.
122pub fn extract_bytes(bytes: &[u8], filename: &str) -> Result<Document> {
123    extract_bytes_with_options(bytes, filename, ExtractOptions::default())
124}
125
126pub fn extract_bytes_with_options(
127    bytes: &[u8],
128    filename: &str,
129    options: ExtractOptions,
130) -> Result<Document> {
131    let format = InputFormat::detect_path(filename)?;
132    if format.extraction_status() == ExtractionStatus::Planned {
133        return Err(DonglerError::planned_format(format.as_str()));
134    }
135
136    let source = Source::from_bytes_for_format(bytes, filename, format)?;
137    let mut document = engine_extract(format, &source)?;
138    apply_extract_options(&mut document, &options);
139    Ok(document)
140}
141
/// Settings for the external-process OCR fallback.
#[derive(Clone, Debug)]
struct OcrFallbackConfig {
    /// Program invoked to render a PDF page to an image.
    renderer: String,
    /// Program invoked to run OCR on the rendered image.
    engine: String,
    /// Directory in which intermediate page images are written.
    temp_dir: PathBuf,
}
148
/// Report whether the `DONGLER_OCR_FALLBACK` environment variable opts in to
/// the OCR fallback.
///
/// The accepted values are `1`, `true`, `yes` and `on`, compared without
/// regard to ASCII case. An unset or unreadable variable counts as disabled.
fn ocr_fallback_enabled() -> bool {
    const ENABLING_VALUES: [&str; 4] = ["1", "true", "yes", "on"];

    match std::env::var("DONGLER_OCR_FALLBACK") {
        Ok(value) => ENABLING_VALUES
            .iter()
            .any(|accepted| value.eq_ignore_ascii_case(accepted)),
        Err(_) => false,
    }
}
158
159fn apply_ocr_fallback(document: &mut Document) {
160    if document.metadata.format != "pdf" {
161        return;
162    }
163    let Some(source_path) = document.metadata.source.as_deref().map(PathBuf::from) else {
164        return;
165    };
166    if !source_path.exists() {
167        return;
168    }
169    let config = ocr_fallback_config();
170    let mut changed = false;
171
172    for page in &mut document.pages {
173        if !page_needs_ocr_fallback(page) {
174            continue;
175        }
176
177        match ocr_pdf_page(&source_path, page.number, &config) {
178            Ok(Some(text)) => {
179                insert_ocr_text_block(page, text);
180                changed = true;
181            }
182            Ok(None) => {}
183            Err(message) => page.warnings.push(Warning {
184                code: "ocr.fallback".to_owned(),
185                severity: "warning".to_owned(),
186                message,
187                source_anchor: Some(SourceAnchor {
188                    page_number: page.number,
189                    pdf_object_ids: Vec::new(),
190                    bbox: page.bbox,
191                    extraction_method: "ocr_fallback".to_owned(),
192                }),
193            }),
194        }
195    }
196
197    if changed {
198        refresh_document_counts(document);
199    }
200}
201
202fn ocr_fallback_config() -> OcrFallbackConfig {
203    OcrFallbackConfig {
204        renderer: std::env::var("DONGLER_PDF_RENDERER").unwrap_or_else(|_| "pdftoppm".to_owned()),
205        engine: std::env::var("DONGLER_OCR_ENGINE").unwrap_or_else(|_| "tesseract".to_owned()),
206        temp_dir: std::env::var("DONGLER_OCR_TEMP_DIR")
207            .map(PathBuf::from)
208            .unwrap_or_else(|_| {
209                std::env::current_dir()
210                    .unwrap_or_else(|_| std::env::temp_dir())
211                    .join("target")
212                    .join("dongler-ocr")
213            }),
214    }
215}
216
217fn page_needs_ocr_fallback(page: &Page) -> bool {
218    !page.images.is_empty()
219        && !page.blocks.iter().any(|block| match block {
220            Block::Text(text) => !text.text.trim().is_empty(),
221            Block::Table(table) => {
222                table.headers.iter().any(|value| !value.trim().is_empty())
223                    || table
224                        .rows
225                        .iter()
226                        .flatten()
227                        .any(|value| !value.trim().is_empty())
228            }
229            Block::Figure(_) => false,
230        })
231}
232
233fn ocr_pdf_page(
234    source_path: &Path,
235    page_number: usize,
236    config: &OcrFallbackConfig,
237) -> std::result::Result<Option<String>, String> {
238    fs::create_dir_all(&config.temp_dir).map_err(|error| {
239        format!(
240            "could not create OCR temp dir {}: {error}",
241            config.temp_dir.display()
242        )
243    })?;
244    let prefix = config.temp_dir.join(format!(
245        "page-{}-{}-{}",
246        std::process::id(),
247        page_number,
248        SystemTime::now()
249            .duration_since(UNIX_EPOCH)
250            .map(|duration| duration.as_nanos())
251            .unwrap_or_default()
252    ));
253    let image_path = prefix.with_extension("png");
254    let page = page_number.to_string();
255    let render_output = Command::new(&config.renderer)
256        .args([
257            "-f",
258            page.as_str(),
259            "-l",
260            page.as_str(),
261            "-r",
262            "200",
263            "-png",
264            "-singlefile",
265        ])
266        .arg(source_path)
267        .arg(&prefix)
268        .output()
269        .map_err(|error| format!("could not run PDF renderer {}: {error}", config.renderer))?;
270
271    if !render_output.status.success() {
272        let stderr = String::from_utf8_lossy(&render_output.stderr);
273        return Err(format!(
274            "PDF renderer {} failed: {}",
275            config.renderer,
276            stderr.trim()
277        ));
278    }
279
280    let ocr_output = Command::new(&config.engine)
281        .arg(&image_path)
282        .arg("stdout")
283        .args(["--psm", "6"])
284        .output()
285        .map_err(|error| format!("could not run OCR engine {}: {error}", config.engine));
286    let _ = fs::remove_file(&image_path);
287
288    let ocr_output = ocr_output?;
289    if !ocr_output.status.success() {
290        let stderr = String::from_utf8_lossy(&ocr_output.stderr);
291        return Err(format!(
292            "OCR engine {} failed: {}",
293            config.engine,
294            stderr.trim()
295        ));
296    }
297
298    let text = normalize_ocr_text(&String::from_utf8_lossy(&ocr_output.stdout));
299    Ok((!text.is_empty()).then_some(text))
300}
301
/// Tidy raw OCR output: collapse each run of whitespace within a line to a
/// single space, trim the line, drop lines that end up empty, and join the
/// remaining lines with `\n`.
fn normalize_ocr_text(text: &str) -> String {
    let mut cleaned = String::new();

    for raw_line in text.lines() {
        let mut words = raw_line.split_whitespace();
        let Some(first_word) = words.next() else {
            // Blank line: contributes nothing.
            continue;
        };

        if !cleaned.is_empty() {
            cleaned.push('\n');
        }
        cleaned.push_str(first_word);
        for word in words {
            cleaned.push(' ');
            cleaned.push_str(word);
        }
    }

    cleaned
}
309
310fn insert_ocr_text_block(page: &mut Page, text: String) {
311    let bbox = page.bbox;
312    page.blocks.insert(
313        0,
314        Block::Text(TextBlock {
315            text: text.clone(),
316            kind: "ocr_text".to_owned(),
317            bbox,
318            lines: vec![Line {
319                text: text.clone(),
320                bbox,
321                spans: vec![Span {
322                    text,
323                    bbox,
324                    font: None,
325                    size: None,
326                    bold: false,
327                    italic: false,
328                }],
329            }],
330            source_anchors: vec![SourceAnchor {
331                page_number: page.number,
332                pdf_object_ids: Vec::new(),
333                bbox,
334                extraction_method: "ocr_fallback".to_owned(),
335            }],
336            confidence: Some(Confidence {
337                score: 0.55,
338                calibrated: false,
339            }), ..Default::default()
340        }),
341    );
342}
343
344fn apply_extract_options(document: &mut Document, options: &ExtractOptions) {
345    if options.suppress_headers_footers {
346        suppress_repeated_headers_footers(document);
347    }
348
349    if !options.include_geometry {
350        for page in &mut document.pages {
351            page.bbox = None;
352            page.width = None;
353            page.height = None;
354            for block in &mut page.blocks {
355                match block {
356                    Block::Text(text) => {
357                        text.bbox = None;
358                        text.lines.clear();
359                        for anchor in &mut text.source_anchors {
360                            anchor.bbox = None;
361                        }
362                    }
363                    Block::Table(table) => {
364                        table.bbox = None;
365                        for cell in &mut table.cells {
366                            cell.bbox = None;
367                        }
368                        for anchor in &mut table.source_anchors {
369                            anchor.bbox = None;
370                        }
371                    }
372                    Block::Figure(figure) => {
373                        figure.bbox = None;
374                        for anchor in &mut figure.source_anchors {
375                            anchor.bbox = None;
376                        }
377                    }
378                }
379            }
380            for image in &mut page.images {
381                image.bbox = None;
382            }
383            for asset in &mut page.assets {
384                asset.bbox = None;
385            }
386        }
387    }
388
389    if !options.include_assets {
390        document.assets.clear();
391        for page in &mut document.pages {
392            page.assets.clear();
393            page.images.clear();
394        }
395    }
396}
397
398fn suppress_repeated_headers_footers(document: &mut Document) {
399    if document.pages.len() < 2 {
400        return;
401    }
402
403    let mut occurrences = HashMap::new();
404    for page in &document.pages {
405        let mut seen_on_page = HashSet::new();
406        for block in &page.blocks {
407            if let Some(key) = header_footer_key(page.height, block) {
408                seen_on_page.insert(key);
409            }
410        }
411        for key in seen_on_page {
412            *occurrences.entry(key).or_insert(0usize) += 1;
413        }
414    }
415
416    let minimum_pages = 2.max((document.pages.len() + 1) / 2);
417    let repeated = occurrences
418        .into_iter()
419        .filter_map(|(key, count)| (count >= minimum_pages).then_some(key))
420        .collect::<HashSet<_>>();
421    if repeated.is_empty() {
422        return;
423    }
424
425    for page in &mut document.pages {
426        let page_height = page.height;
427        page.blocks.retain(|block| {
428            header_footer_key(page_height, block)
429                .map(|key| !repeated.contains(&key))
430                .unwrap_or(true)
431        });
432    }
433    refresh_document_counts(document);
434}
435
436fn header_footer_key(page_height: Option<f32>, block: &Block) -> Option<String> {
437    let height = page_height?;
438    if height <= 0.0 {
439        return None;
440    }
441
442    let bbox = block_bbox(block)?;
443    let center_y = bbox.y + bbox.height / 2.0;
444    let margin = (height * 0.12).max(48.0);
445    let band = if center_y >= height - margin {
446        "top"
447    } else if center_y <= margin {
448        "bottom"
449    } else {
450        return None;
451    };
452
453    let text = normalize_repeated_margin_text(&block_text(block));
454    (!text.is_empty()).then(|| format!("{band}:{text}"))
455}
456
457fn block_bbox(block: &Block) -> Option<BBox> {
458    match block {
459        Block::Text(text) => text.bbox,
460        Block::Table(table) => table.bbox,
461        Block::Figure(figure) => figure.bbox,
462    }
463}
464
/// Canonicalize margin text so that headers/footers differing only in page
/// numbers, letter case or spacing compare equal.
///
/// The text is lowercased, every run of ASCII digits becomes a single `#`
/// (a digit directly after an existing `#` adds nothing), every run of
/// whitespace becomes one space, and leading/trailing whitespace is removed.
fn normalize_repeated_margin_text(text: &str) -> String {
    let mut normalized = String::new();

    for ch in text.chars().flat_map(char::to_lowercase) {
        match ch {
            digit if digit.is_ascii_digit() => {
                if !normalized.ends_with('#') {
                    normalized.push('#');
                }
            }
            space if space.is_whitespace() => {
                // Only a literal ' ' is ever pushed for whitespace, so the
                // buffer's tail tells us whether a separator is pending.
                let separator_pending = normalized.is_empty() || normalized.ends_with(' ');
                if !separator_pending {
                    normalized.push(' ');
                }
            }
            other => normalized.push(other),
        }
    }

    normalized.trim().to_owned()
}
486
487fn refresh_document_counts(document: &mut Document) {
488    let mut character_count = 0;
489    let mut word_count = 0;
490    let mut block_count = 0;
491
492    for page in &document.pages {
493        for block in &page.blocks {
494            let text = block_text(block);
495            character_count += text.chars().count();
496            word_count += text.split_whitespace().count();
497            block_count += 1;
498        }
499    }
500
501    document.metadata.character_count = character_count;
502    document.metadata.word_count = word_count;
503    document.metadata.block_count = block_count;
504}
505
506fn block_text(block: &Block) -> String {
507    match block {
508        Block::Text(text) => text.text.clone(),
509        Block::Table(table) => {
510            let mut rows = Vec::new();
511            if !table.headers.is_empty() {
512                rows.push(table.headers.join(" "));
513            }
514            rows.extend(table.rows.iter().map(|row| row.join(" ")));
515            rows.join("\n")
516        }
517        Block::Figure(figure) => figure.caption.clone().unwrap_or_default(),
518    }
519}
520
521pub fn load_many<I, P>(paths: I) -> Vec<BatchResult>
522where
523    I: IntoIterator<Item = P>,
524    P: AsRef<Path>,
525{
526    paths
527        .into_iter()
528        .map(|path| {
529            let path = path.as_ref();
530            let path_string = path.display().to_string();
531
532            match load_path(path) {
533                Ok(document) => BatchResult {
534                    path: path_string,
535                    ok: true,
536                    document: Some(document),
537                    error: None,
538                },
539                Err(error) => BatchResult {
540                    path: path_string,
541                    ok: false,
542                    document: None,
543                    error: Some(error.to_string()),
544                },
545            }
546        })
547        .collect()
548}
549
550pub fn to_markdown(text: &str) -> Result<String> {
551    let document = parse_text(text)?;
552    document.to_markdown()
553}
554
555pub fn to_json(text: &str) -> Result<String> {
556    let document = parse_text(text)?;
557    document.to_json()
558}
559
560pub fn to_latex(text: &str) -> Result<String> {
561    let document = parse_text(text)?;
562    document.to_latex()
563}
564
565pub fn detect_format(path: &str) -> Result<String> {
566    Ok(InputFormat::detect_path(path)?.as_str().to_owned())
567}