dongler-core 0.2.0

Created by Daniel Fat. Rust-native document extraction core for structured Markdown and LaTeX output.
Documentation
use crate::error::Result;
use crate::ir::{Block, Document, Metadata, Page, TextBlock, SCHEMA_VERSION};
use crate::source::Source;

/// Common interface for engines that turn a [`Source`] into a [`Document`].
pub trait ExtractionEngine {
    /// Returns a static, human-readable identifier for this engine.
    fn name(&self) -> &'static str;
    /// Extracts a [`Document`] from `source`.
    ///
    /// # Errors
    ///
    /// Returns the crate's error type when an implementation fails to
    /// extract; the failure conditions are implementation-specific.
    fn extract(&self, source: &Source) -> Result<Document>;
}

/// Stateless extraction engine that treats the source content as plain text
/// and splits it into paragraph blocks.
///
/// It is a unit struct, so it carries no configuration and is freely copyable.
#[derive(Debug, Default, Clone, Copy)]
pub struct PlainTextEngine;

impl ExtractionEngine for PlainTextEngine {
    /// Returns the fixed identifier `"plain-text"`, which `extract` also
    /// records in the document metadata.
    fn name(&self) -> &'static str {
        "plain-text"
    }

    /// Builds a single-page [`Document`] from the text content of `source`.
    ///
    /// The content is split into paragraphs and each paragraph becomes one
    /// `Block::Text` of kind `"paragraph"` with no geometry, lines, anchors or
    /// confidence. All blocks are placed on page number 1. Metadata records
    /// the character, word and block counts, plus the byte size when the
    /// source carries raw bytes.
    ///
    /// This implementation always returns `Ok`.
    fn extract(&self, source: &Source) -> Result<Document> {
        // One text block per paragraph, kept in source order.
        let mut blocks = Vec::new();
        for text in split_paragraphs(&source.content) {
            blocks.push(Block::Text(TextBlock {
                text,
                kind: String::from("paragraph"),
                bbox: None,
                lines: Vec::new(),
                source_anchors: Vec::new(),
                confidence: None,
            }));
        }

        // Assembled before `blocks` is moved into the page so that the block
        // count can still be read from the vector.
        let metadata = Metadata {
            format: source.format.clone(),
            engine: String::from(self.name()),
            source: source.path.clone(),
            title: None,
            character_count: source.content.chars().count(),
            word_count: source.content.split_whitespace().count(),
            block_count: blocks.len(),
            file_size_bytes: source.bytes.as_ref().map(|raw| raw.len() as u64),
            pdf_version: None,
            encrypted: false,
        };

        // Plain text has no page geometry, so every layout field stays unset.
        let page = Page {
            number: 1,
            width: None,
            height: None,
            rotation: None,
            bbox: None,
            blocks,
            images: Vec::new(),
            assets: Vec::new(),
            warnings: Vec::new(),
        };

        Ok(Document {
            schema_version: SCHEMA_VERSION.to_owned(),
            metadata,
            pages: vec![page],
            assets: Vec::new(),
            warnings: Vec::new(),
        })
    }
}

/// Splits `text` into paragraphs separated by blank lines.
///
/// Every line is trimmed. A line that is empty after trimming ends the
/// paragraph being collected; any other line is appended to it. The lines of
/// a paragraph are joined with single spaces, and runs of blank lines never
/// produce empty paragraphs.
fn split_paragraphs(text: &str) -> Vec<String> {
    let mut finished: Vec<String> = Vec::new();
    let mut pending: Vec<String> = Vec::new();

    for content in text.lines().map(str::trim) {
        match content {
            // Blank (or whitespace-only) line: close the open paragraph, if any.
            "" => flush_paragraph(&mut finished, &mut pending),
            _ => pending.push(content.to_owned()),
        }
    }

    // The last paragraph is not followed by a blank line, so close it here.
    flush_paragraph(&mut finished, &mut pending);
    finished
}

/// Moves the lines collected in `current` into `paragraphs` as one paragraph.
///
/// The lines are joined with a single space and `current` is emptied
/// afterwards. When `current` holds no lines, nothing is pushed.
fn flush_paragraph(paragraphs: &mut Vec<String>, current: &mut Vec<String>) {
    if current.is_empty() {
        return;
    }

    let merged = current.join(" ");
    paragraphs.push(merged);
    current.clear();
}