//! dongler-core 0.3.1

pub mod archive;
pub mod csv;
pub mod engine;
pub mod error;
pub mod format;
pub mod image;
pub mod ir;
pub mod json;
pub mod openxml;
pub mod pdf;
pub mod render;
pub mod source;
pub mod textual;

use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{SystemTime, UNIX_EPOCH};

pub use archive::ArchiveEngine;
pub use csv::CsvEngine;
pub use engine::{ExtractionEngine, PlainTextEngine};
pub use error::{DonglerError, Result};
pub use format::{ExtractionStatus, InputFormat};
pub use image::ImageEngine;
pub use ir::{
    Asset, BBox, BatchResult, Block, Confidence, Document, ExtractOptions, FigureBlock,
    ImageObject, Line, Metadata, Page, SourceAnchor, Span, TableBlock, TableCell, TextBlock,
    Warning,
};
pub use json::JsonEngine;
pub use openxml::OpenXmlEngine;
pub use pdf::PdfEngine;
pub use render::{JsonRenderer, LatexRenderer, MarkdownRenderer, Renderer};
pub use source::{
    FormatSourceLoader, ImageSourceLoader, PdfSourceLoader, Source, SourceLoader, TextSourceLoader,
};
pub use textual::{EmailEngine, HtmlEngine, XmlEngine};

/// Rendering shortcuts that hand the document to one of the bundled renderers.
impl Document {
    /// Renders this document with [`MarkdownRenderer`].
    pub fn to_markdown(&self) -> Result<String> {
        let renderer = MarkdownRenderer;
        renderer.render(self)
    }

    /// Renders this document with [`JsonRenderer`].
    pub fn to_json(&self) -> Result<String> {
        let renderer = JsonRenderer;
        renderer.render(self)
    }

    /// Renders this document with [`LatexRenderer`].
    pub fn to_latex(&self) -> Result<String> {
        let renderer = LatexRenderer;
        renderer.render(self)
    }
}

/// Builds a [`Document`] from an in-memory string.
///
/// The text is wrapped in a [`Source`] via `Source::from_text` and run through
/// [`PlainTextEngine`]; any error reported by that engine is returned unchanged.
pub fn parse_text(text: &str) -> Result<Document> {
    let source = Source::from_text(text);
    PlainTextEngine.extract(&source)
}

/// Loads and extracts the file at `path` using `ExtractOptions::default()`.
///
/// This is a shorthand for [`load_path_with_options`]; see that function for
/// the format dispatch and the errors that can be returned.
pub fn load_path(path: impl AsRef<Path>) -> Result<Document> {
    let defaults = ExtractOptions::default();
    load_path_with_options(path.as_ref(), defaults)
}

/// Loads the file at `path`, extracts it with the engine matching its detected
/// format, and post-processes the result according to `options`.
///
/// Steps, in order:
/// 1. Detect the format with `InputFormat::detect_path`.
/// 2. Load a source with the loader for that format and run the matching engine.
/// 3. If the `DONGLER_OCR_FALLBACK` environment switch is on, try the OCR fallback.
/// 4. Apply `options` (header/footer suppression, geometry and asset stripping).
///
/// # Errors
///
/// Propagates failures from format detection, source loading and extraction.
/// The legacy Word/Excel/Presentation/Email formats always yield
/// `DonglerError::planned_format` without touching the file contents.
pub fn load_path_with_options(path: impl AsRef<Path>, options: ExtractOptions) -> Result<Document> {
    let path = path.as_ref();
    let format = InputFormat::detect_path(path)?;

    let extracted = match format {
        // Recognised but not implemented yet: fail before loading anything.
        InputFormat::LegacyWord
        | InputFormat::LegacyExcel
        | InputFormat::LegacyPresentation
        | InputFormat::LegacyEmail => Err(DonglerError::planned_format(format.as_str())),

        // Formats with a dedicated source loader.
        InputFormat::Text => PlainTextEngine.extract(&TextSourceLoader.load(path)?),
        InputFormat::Pdf => PdfEngine.extract(&PdfSourceLoader.load(path)?),
        InputFormat::Image => ImageEngine.extract(&ImageSourceLoader.load(path)?),

        // Everything else goes through the generic, format-tagged loader.
        InputFormat::Archive => {
            ArchiveEngine.extract(&FormatSourceLoader::new(format).load(path)?)
        }
        InputFormat::Word
        | InputFormat::Excel
        | InputFormat::Presentation
        | InputFormat::OpenDocument => {
            OpenXmlEngine.extract(&FormatSourceLoader::new(format).load(path)?)
        }
        InputFormat::Html => HtmlEngine.extract(&FormatSourceLoader::new(format).load(path)?),
        InputFormat::Email => EmailEngine.extract(&FormatSourceLoader::new(format).load(path)?),
        InputFormat::Xml => XmlEngine.extract(&FormatSourceLoader::new(format).load(path)?),
        InputFormat::Json => JsonEngine.extract(&FormatSourceLoader::new(format).load(path)?),
        InputFormat::Csv => CsvEngine.extract(&FormatSourceLoader::new(format).load(path)?),
    };
    let mut document = extracted?;

    // OCR runs before the options are applied, while geometry and images are
    // still present on the pages.
    if ocr_fallback_enabled() {
        apply_ocr_fallback(&mut document);
    }
    apply_extract_options(&mut document, &options);

    Ok(document)
}

/// External tools and scratch location used by the OCR fallback.
///
/// Populated from environment variables by `ocr_fallback_config`.
#[derive(Debug, Clone)]
struct OcrFallbackConfig {
    /// Program invoked to rasterise a single PDF page to a PNG
    /// (`DONGLER_PDF_RENDERER`, defaulting to `pdftoppm`).
    renderer: String,
    /// Program invoked to OCR the rendered PNG
    /// (`DONGLER_OCR_ENGINE`, defaulting to `tesseract`).
    engine: String,
    /// Directory that receives the temporary page images
    /// (`DONGLER_OCR_TEMP_DIR`, defaulting to `target/dongler-ocr` under the
    /// current directory).
    temp_dir: PathBuf,
}

/// Reports whether the OCR fallback is switched on through the environment.
///
/// `DONGLER_OCR_FALLBACK` must equal `1`, `true`, `yes` or `on` (ASCII
/// case-insensitive). An unset or unreadable variable counts as disabled.
fn ocr_fallback_enabled() -> bool {
    let flag = std::env::var("DONGLER_OCR_FALLBACK")
        .map(|raw| raw.to_ascii_lowercase())
        .unwrap_or_default();

    ["1", "true", "yes", "on"].contains(&flag.as_str())
}

/// Runs the external OCR pipeline on PDF pages that carry images but no text.
///
/// Nothing happens unless the document's metadata format is `"pdf"`, it
/// records a source path, and that path still exists on disk. For every page
/// selected by `page_needs_ocr_fallback`:
/// * recognised text is inserted as a leading OCR text block;
/// * an OCR failure is recorded as an `ocr.fallback` warning on the page.
///
/// Document-level counts are refreshed only when at least one page gained text.
fn apply_ocr_fallback(document: &mut Document) {
    if document.metadata.format != "pdf" {
        return;
    }
    let source_path = match document.metadata.source.as_deref() {
        Some(raw) => PathBuf::from(raw),
        None => return,
    };
    if !source_path.exists() {
        return;
    }

    let config = ocr_fallback_config();
    let mut recovered_any = false;

    let candidates = document
        .pages
        .iter_mut()
        .filter(|page| page_needs_ocr_fallback(page));
    for page in candidates {
        let outcome = ocr_pdf_page(&source_path, page.number, &config);
        match outcome {
            Ok(Some(text)) => {
                insert_ocr_text_block(page, text);
                recovered_any = true;
            }
            // OCR ran fine but found nothing worth keeping.
            Ok(None) => {}
            Err(message) => {
                let anchor = SourceAnchor {
                    page_number: page.number,
                    pdf_object_ids: Vec::new(),
                    bbox: page.bbox,
                    extraction_method: "ocr_fallback".to_owned(),
                };
                let warning = Warning {
                    code: "ocr.fallback".to_owned(),
                    severity: "warning".to_owned(),
                    message,
                    source_anchor: Some(anchor),
                };
                page.warnings.push(warning);
            }
        }
    }

    if recovered_any {
        refresh_document_counts(document);
    }
}

/// Assembles the OCR fallback configuration from the environment.
///
/// * `DONGLER_PDF_RENDERER` — renderer program, default `pdftoppm`.
/// * `DONGLER_OCR_ENGINE` — OCR program, default `tesseract`.
/// * `DONGLER_OCR_TEMP_DIR` — scratch directory; when unset, `target/dongler-ocr`
///   under the current directory (or under the system temp dir if the current
///   directory cannot be determined).
fn ocr_fallback_config() -> OcrFallbackConfig {
    let env_or = |key: &str, fallback: &str| {
        std::env::var(key).unwrap_or_else(|_| fallback.to_owned())
    };

    let renderer = env_or("DONGLER_PDF_RENDERER", "pdftoppm");
    let engine = env_or("DONGLER_OCR_ENGINE", "tesseract");
    let temp_dir = match std::env::var("DONGLER_OCR_TEMP_DIR") {
        Ok(configured) => PathBuf::from(configured),
        Err(_) => {
            let base = std::env::current_dir().unwrap_or_else(|_| std::env::temp_dir());
            base.join("target").join("dongler-ocr")
        }
    };

    OcrFallbackConfig {
        renderer,
        engine,
        temp_dir,
    }
}

/// Decides whether a page is a candidate for OCR.
///
/// A page qualifies when it has at least one image and none of its blocks
/// carries non-blank text. Text blocks count through their text, tables
/// through any non-blank header or row value, and figures never count.
fn page_needs_ocr_fallback(page: &Page) -> bool {
    if page.images.is_empty() {
        return false;
    }

    for block in &page.blocks {
        let has_text = match block {
            Block::Text(text) => !text.text.trim().is_empty(),
            Block::Table(table) => {
                let header_text = table.headers.iter().any(|value| !value.trim().is_empty());
                header_text
                    || table
                        .rows
                        .iter()
                        .flatten()
                        .any(|value| !value.trim().is_empty())
            }
            Block::Figure(_) => false,
        };
        if has_text {
            return false;
        }
    }

    true
}

/// Rasterises one PDF page with the configured renderer and OCRs the image.
///
/// The renderer is invoked with pdftoppm-style flags (`-f`/`-l` to select the
/// page, `-r 200`, `-png`, `-singlefile`) and an output prefix inside
/// `config.temp_dir`; the OCR engine is then invoked as
/// `<engine> <image> stdout --psm 6` and its standard output is captured.
///
/// # Returns
///
/// * `Ok(Some(text))` — whitespace-normalised OCR text.
/// * `Ok(None)` — OCR succeeded but produced no non-blank text.
///
/// # Errors
///
/// Returns a human-readable message when the temp directory cannot be
/// created, when either program cannot be started, or when either program
/// exits unsuccessfully (its trimmed stderr is included in the message).
///
/// The temporary page image is removed on a best-effort basis on every path
/// that could have created it, including renderer failure.
fn ocr_pdf_page(
    source_path: &Path,
    page_number: usize,
    config: &OcrFallbackConfig,
) -> std::result::Result<Option<String>, String> {
    fs::create_dir_all(&config.temp_dir).map_err(|error| {
        format!(
            "could not create OCR temp dir {}: {error}",
            config.temp_dir.display()
        )
    })?;

    // Process id + page number + timestamp keep concurrent runs and repeated
    // pages from colliding on the same file name.
    let prefix = config.temp_dir.join(format!(
        "page-{}-{}-{}",
        std::process::id(),
        page_number,
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map(|duration| duration.as_nanos())
            .unwrap_or_default()
    ));
    // The renderer receives the bare prefix and is expected to append `.png`.
    let image_path = prefix.with_extension("png");
    let page = page_number.to_string();
    let render_output = Command::new(&config.renderer)
        .args([
            "-f",
            page.as_str(),
            "-l",
            page.as_str(),
            "-r",
            "200",
            "-png",
            "-singlefile",
        ])
        .arg(source_path)
        .arg(&prefix)
        .output()
        .map_err(|error| format!("could not run PDF renderer {}: {error}", config.renderer))?;

    if !render_output.status.success() {
        // A renderer that fails part-way may already have written the image;
        // remove it so failed pages do not accumulate files in the temp dir.
        let _ = fs::remove_file(&image_path);
        let stderr = String::from_utf8_lossy(&render_output.stderr);
        return Err(format!(
            "PDF renderer {} failed: {}",
            config.renderer,
            stderr.trim()
        ));
    }

    let ocr_output = Command::new(&config.engine)
        .arg(&image_path)
        .arg("stdout")
        .args(["--psm", "6"])
        .output()
        .map_err(|error| format!("could not run OCR engine {}: {error}", config.engine));
    // Clean up before inspecting the result so the image never outlives the
    // OCR attempt, whether or not the engine could be started.
    let _ = fs::remove_file(&image_path);

    let ocr_output = ocr_output?;
    if !ocr_output.status.success() {
        let stderr = String::from_utf8_lossy(&ocr_output.stderr);
        return Err(format!(
            "OCR engine {} failed: {}",
            config.engine,
            stderr.trim()
        ));
    }

    let text = normalize_ocr_text(&String::from_utf8_lossy(&ocr_output.stdout));
    Ok((!text.is_empty()).then_some(text))
}

/// Tidies raw OCR output.
///
/// Within each line, runs of whitespace collapse to a single space and
/// leading/trailing whitespace is dropped; lines that end up empty are
/// removed. The surviving lines are joined with `\n` (no trailing newline).
fn normalize_ocr_text(text: &str) -> String {
    let mut normalized = String::new();

    for raw_line in text.lines() {
        let mut words = raw_line.split_whitespace().peekable();
        if words.peek().is_none() {
            // Blank or whitespace-only line: contributes nothing.
            continue;
        }
        if !normalized.is_empty() {
            normalized.push('\n');
        }
        for (index, word) in words.enumerate() {
            if index > 0 {
                normalized.push(' ');
            }
            normalized.push_str(word);
        }
    }

    normalized
}

/// Prepends an OCR-derived text block to `page`.
///
/// The block has kind `ocr_text`, spans the page's own bounding box, and holds
/// a single line with a single span containing the whole text. It is anchored
/// to the page with extraction method `ocr_fallback` and carries an
/// uncalibrated confidence score of 0.55.
fn insert_ocr_text_block(page: &mut Page, text: String) {
    let bbox = page.bbox;

    let span = Span {
        text: text.clone(),
        bbox,
        font: None,
        size: None,
    };
    let line = Line {
        text: text.clone(),
        bbox,
        spans: vec![span],
    };
    let anchor = SourceAnchor {
        page_number: page.number,
        pdf_object_ids: Vec::new(),
        bbox,
        extraction_method: "ocr_fallback".to_owned(),
    };
    let confidence = Confidence {
        score: 0.55,
        calibrated: false,
    };

    let block = TextBlock {
        text,
        kind: "ocr_text".to_owned(),
        bbox,
        lines: vec![line],
        source_anchors: vec![anchor],
        confidence: Some(confidence),
    };
    // Index 0 puts the OCR text ahead of any existing blocks on the page.
    page.blocks.insert(0, Block::Text(block));
}

fn apply_extract_options(document: &mut Document, options: &ExtractOptions) {
    if options.suppress_headers_footers {
        suppress_repeated_headers_footers(document);
    }

    if !options.include_geometry {
        for page in &mut document.pages {
            page.bbox = None;
            page.width = None;
            page.height = None;
            for block in &mut page.blocks {
                match block {
                    Block::Text(text) => {
                        text.bbox = None;
                        text.lines.clear();
                        for anchor in &mut text.source_anchors {
                            anchor.bbox = None;
                        }
                    }
                    Block::Table(table) => {
                        table.bbox = None;
                        for cell in &mut table.cells {
                            cell.bbox = None;
                        }
                        for anchor in &mut table.source_anchors {
                            anchor.bbox = None;
                        }
                    }
                    Block::Figure(figure) => {
                        figure.bbox = None;
                        for anchor in &mut figure.source_anchors {
                            anchor.bbox = None;
                        }
                    }
                }
            }
            for image in &mut page.images {
                image.bbox = None;
            }
            for asset in &mut page.assets {
                asset.bbox = None;
            }
        }
    }

    if !options.include_assets {
        document.assets.clear();
        for page in &mut document.pages {
            page.assets.clear();
            page.images.clear();
        }
    }
}

/// Removes blocks that recur in the page margins across the document.
///
/// Each block is reduced to a margin key by `header_footer_key`. A key is
/// counted at most once per page, and is considered repeated when it shows up
/// on at least `max(2, ceil(page_count / 2))` pages. All blocks with a
/// repeated key are dropped and the document counts are refreshed. Documents
/// with fewer than two pages are left untouched.
fn suppress_repeated_headers_footers(document: &mut Document) {
    let page_count = document.pages.len();
    if page_count < 2 {
        return;
    }

    // Number of distinct pages on which each margin key appears.
    let mut pages_per_key: HashMap<String, usize> = HashMap::new();
    for page in &document.pages {
        let keys_on_page: HashSet<String> = page
            .blocks
            .iter()
            .filter_map(|block| header_footer_key(page.height, block))
            .collect();
        for key in keys_on_page {
            *pages_per_key.entry(key).or_default() += 1;
        }
    }

    let threshold = ((page_count + 1) / 2).max(2);
    let repeated: HashSet<String> = pages_per_key
        .into_iter()
        .filter(|(_, count)| *count >= threshold)
        .map(|(key, _)| key)
        .collect();
    if repeated.is_empty() {
        return;
    }

    for page in document.pages.iter_mut() {
        let page_height = page.height;
        page.blocks
            .retain(|block| match header_footer_key(page_height, block) {
                Some(key) => !repeated.contains(&key),
                // Blocks outside the margins (or without geometry) always stay.
                None => true,
            });
    }
    refresh_document_counts(document);
}

/// Computes the header/footer grouping key for a block, if it sits in a margin.
///
/// Returns `None` when the page height is missing or not positive, when the
/// block has no bounding box, when the block's vertical centre lies outside
/// both margin bands, or when its normalised text is empty. The margin is
/// `max(12% of the page height, 48)` in the same units as the bounding boxes.
///
/// The key has the form `"<band>:<normalised text>"`, where the band is `top`
/// for centres at or beyond `height - margin` and `bottom` for centres at or
/// below `margin`.
// NOTE(review): the band labels presume y grows upward from the bottom of the
// page — confirm against the coordinate convention used by the engines.
fn header_footer_key(page_height: Option<f32>, block: &Block) -> Option<String> {
    let height = match page_height {
        Some(value) if value <= 0.0 => return None,
        Some(value) => value,
        None => return None,
    };

    let bbox = block_bbox(block)?;
    let margin = (height * 0.12).max(48.0);
    let center_y = bbox.y + bbox.height / 2.0;

    // The upper band is tested first, so it wins if the two bands overlap.
    let band = match center_y {
        y if y >= height - margin => "top",
        y if y <= margin => "bottom",
        _ => return None,
    };

    let text = normalize_repeated_margin_text(&block_text(block));
    if text.is_empty() {
        None
    } else {
        Some(format!("{band}:{text}"))
    }
}

/// Returns the bounding box stored on a block, whatever its variant.
fn block_bbox(block: &Block) -> Option<BBox> {
    let bbox = match block {
        Block::Text(inner) => &inner.bbox,
        Block::Table(inner) => &inner.bbox,
        Block::Figure(inner) => &inner.bbox,
    };
    *bbox
}

/// Canonicalises margin text so that headers/footers differing only in case,
/// spacing or numbers compare equal.
///
/// * Characters are lower-cased one by one.
/// * Every ASCII digit becomes `#`, and a `#` is never emitted directly after
///   another `#` (so `12` and a literal `#3` both yield a single `#`).
/// * Runs of whitespace collapse to one space; leading and trailing
///   whitespace is removed.
fn normalize_repeated_margin_text(text: &str) -> String {
    let mut normalized = String::new();
    // A separator is owed once whitespace follows emitted content; it is only
    // written when more content arrives, which trims the tail implicitly.
    let mut pending_space = false;

    for character in text.chars().flat_map(char::to_lowercase) {
        if character.is_whitespace() {
            pending_space = !normalized.is_empty();
            continue;
        }

        if pending_space {
            normalized.push(' ');
            pending_space = false;
        }

        if !character.is_ascii_digit() {
            normalized.push(character);
        } else if !normalized.ends_with('#') {
            normalized.push('#');
        }
    }

    normalized
}

/// Recomputes the character, word and block counts stored in the metadata.
///
/// Counts are taken over every block of every page using `block_text`:
/// characters are Unicode scalar values, words are whitespace-separated runs.
fn refresh_document_counts(document: &mut Document) {
    let (characters, words, blocks) = document
        .pages
        .iter()
        .flat_map(|page| page.blocks.iter())
        .map(|block| block_text(block))
        .fold((0usize, 0usize, 0), |(chars, words, blocks), text| {
            (
                chars + text.chars().count(),
                words + text.split_whitespace().count(),
                blocks + 1,
            )
        });

    document.metadata.character_count = characters;
    document.metadata.word_count = words;
    document.metadata.block_count = blocks;
}

/// Flattens a block to plain text.
///
/// * Text blocks yield their text.
/// * Tables yield one line per row (cells joined by a space), preceded by a
///   header line when the table has headers.
/// * Figures yield their caption, or an empty string when there is none.
fn block_text(block: &Block) -> String {
    match block {
        Block::Text(text) => text.text.clone(),
        Block::Figure(figure) => figure.caption.clone().unwrap_or_default(),
        Block::Table(table) => {
            let header_line = (!table.headers.is_empty()).then(|| table.headers.join(" "));
            header_line
                .into_iter()
                .chain(table.rows.iter().map(|row| row.join(" ")))
                .collect::<Vec<_>>()
                .join("\n")
        }
    }
}

pub fn load_many<I, P>(paths: I) -> Vec<BatchResult>
where
    I: IntoIterator<Item = P>,
    P: AsRef<Path>,
{
    paths
        .into_iter()
        .map(|path| {
            let path = path.as_ref();
            let path_string = path.display().to_string();

            match load_path(path) {
                Ok(document) => BatchResult {
                    path: path_string,
                    ok: true,
                    document: Some(document),
                    error: None,
                },
                Err(error) => BatchResult {
                    path: path_string,
                    ok: false,
                    document: None,
                    error: Some(error.to_string()),
                },
            }
        })
        .collect()
}

/// Parses `text` as plain text and renders the result as Markdown.
///
/// Fails if either parsing or rendering fails.
pub fn to_markdown(text: &str) -> Result<String> {
    parse_text(text)?.to_markdown()
}

/// Parses `text` as plain text and renders the result as JSON.
///
/// Fails if either parsing or rendering fails.
pub fn to_json(text: &str) -> Result<String> {
    parse_text(text)?.to_json()
}

/// Parses `text` as plain text and renders the result as LaTeX.
///
/// Fails if either parsing or rendering fails.
pub fn to_latex(text: &str) -> Result<String> {
    parse_text(text)?.to_latex()
}

/// Detects the input format of `path` and returns its string name.
///
/// The name is whatever `InputFormat::as_str` reports for the detected
/// format; detection errors are passed through.
pub fn detect_format(path: &str) -> Result<String> {
    let format = InputFormat::detect_path(path)?;
    Ok(String::from(format.as_str()))
}