three-dcf-core 0.2.0

Document-to-dataset encoding library for LLM training data preparation. Converts PDFs, Markdown, and HTML into structured formats optimized for machine learning.
Documentation
use once_cell::sync::Lazy;
use regex::Regex;
use unicode_normalization::UnicodeNormalization;

use crate::document::CellType;

/// Multipliers applied to the fixed point values that `importance_score` adds up.
#[derive(Debug, Clone, Copy)]
pub struct ImportanceTuning {
    /// Scales the 35-point bonus given to lines whose letters are all uppercase.
    pub heading_boost: f32,
    /// Scales the 20-point bonus given to lines containing an ASCII digit.
    pub number_boost: f32,
    /// Scales the 40-point base score of footer cells; smaller values rank footers lower.
    pub footer_penalty: f32,
    /// Scales the 15-point bonus given to lines with `line_index < 5`.
    pub early_line_bonus: f32,
}

impl Default for ImportanceTuning {
    fn default() -> Self {
        Self {
            heading_boost: 1.0,
            number_boost: 1.0,
            footer_penalty: 0.5,
            early_line_bonus: 1.0,
        }
    }
}

/// Selects how `normalize_lines` treats lines that end in a hyphen.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HyphenationMode {
    /// Run the lines through `merge_hyphenation` before normalizing them.
    Merge,
    /// Normalize the lines as given, without joining any of them.
    Preserve,
}

/// Normalizes one line of extracted text.
///
/// The line is stripped of leading and trailing control/whitespace characters, converted to
/// Unicode NFKC form, and then rewritten so that:
///
/// * every run of whitespace becomes a single ASCII space,
/// * control characters that are not whitespace are removed,
/// * the result has no leading or trailing space.
///
/// Whitespace is tested *before* the control check: tab, newline, carriage return and NEL
/// are both control characters and whitespace, and they must separate words rather than
/// silently glue them together (`"a\tb"` becomes `"a b"`, not `"ab"`).
pub fn normalize_line(line: &str) -> String {
    let trimmed = line.trim_matches(|c: char| c.is_control() || c.is_whitespace());
    let mut result = String::with_capacity(trimmed.len());
    // Starting in the "space just emitted" state suppresses a leading space, which NFKC can
    // introduce even after the trim above (some characters decompose to space + mark).
    let mut prev_space = true;
    for ch in trimmed.nfkc() {
        if ch.is_whitespace() {
            if !prev_space {
                result.push(' ');
                prev_space = true;
            }
        } else if !ch.is_control() {
            result.push(ch);
            prev_space = false;
        }
    }
    // Runs are collapsed as they are written, so at most one trailing space can remain.
    if result.ends_with(' ') {
        result.pop();
    }
    result
}

/// Normalizes a batch of lines and discards the ones that end up empty.
///
/// In `HyphenationMode::Merge` the input first goes through `merge_hyphenation`; in
/// `HyphenationMode::Preserve` it is used as given. Each resulting line is then passed to
/// `normalize_line`, and lines whose normalized form is empty are left out of the output.
pub fn normalize_lines(lines: &[String], mode: HyphenationMode) -> Vec<String> {
    let staged: Vec<String> = match mode {
        HyphenationMode::Preserve => lines.to_vec(),
        HyphenationMode::Merge => merge_hyphenation(lines),
    };

    let mut cleaned = Vec::new();
    for raw in &staged {
        let normalized = normalize_line(raw);
        if !normalized.is_empty() {
            cleaned.push(normalized);
        }
    }
    cleaned
}

/// Joins words that were split across lines with a trailing hyphen.
///
/// A line whose right-trimmed text is longer than one byte and ends in `-` is held back with
/// its trailing hyphens removed, then prepended to the next line (whose leading whitespace
/// is trimmed first). All other lines are emitted unchanged.
///
/// Lines that consist solely of hyphens (for example a `---` rule) have no word stem to
/// carry forward, so they are emitted as-is instead of being dropped. A fragment that is
/// still held back when the input ends is emitted without its trailing hyphen.
fn merge_hyphenation(lines: &[String]) -> Vec<String> {
    let mut out = Vec::with_capacity(lines.len());
    let mut carry = String::new();
    for line in lines {
        let mut current = if carry.is_empty() {
            line.clone()
        } else {
            // Reuse the held-back fragment's buffer; this also leaves `carry` empty.
            let mut combined = std::mem::take(&mut carry);
            combined.push_str(line.trim_start());
            combined
        };
        let trimmed = current.trim_end();
        if trimmed.len() > 1 && trimmed.ends_with('-') {
            let stem_len = trimmed.trim_end_matches('-').len();
            // An empty stem means the line was nothing but hyphens: not a split word.
            if stem_len > 0 {
                // The stem is a prefix of `current`, so `stem_len` is a char boundary.
                current.truncate(stem_len);
                carry = current;
                continue;
            }
        }
        out.push(current);
    }
    if !carry.is_empty() {
        out.push(carry);
    }
    out
}

/// Assigns a `CellType` to a line using the heuristics in this module.
///
/// The checks run in a fixed order and the first hit wins: table, then header, then footer.
/// A line that matches none of them is classified as plain text.
pub fn classify_cell_type(line: &str) -> CellType {
    if looks_like_table(line) {
        return CellType::Table;
    }
    if looks_like_header(line) {
        return CellType::Header;
    }
    if looks_like_footer(line) {
        return CellType::Footer;
    }
    CellType::Text
}

/// Computes a 0–255 importance score for a line.
///
/// The score is the sum of:
///
/// * a base that depends on `cell_type` (header 220, table 160, footer
///   `40 * tuning.footer_penalty`, anything else 100),
/// * `35 * tuning.heading_boost` if every letter in `line` is uppercase,
/// * `20 * tuning.number_boost` if `line` contains an ASCII digit,
/// * `15 * tuning.early_line_bonus` if `line_index < 5`,
/// * minus 10 for every full 120 bytes of `line`.
///
/// The sum is clamped to `0..=255`. It is accumulated in `i64` so that extreme tuning
/// values saturate at the clamp instead of overflowing the intermediate arithmetic.
pub fn importance_score(
    line: &str,
    cell_type: CellType,
    line_index: usize,
    tuning: &ImportanceTuning,
) -> u8 {
    // A float-to-int `as` cast saturates (and maps NaN to 0), so every scaled term fits in
    // i32; widening to i64 keeps the sum below from overflowing.
    let scaled = |points: f32, factor: f32| i64::from((points * factor) as i32);

    let base: i64 = match cell_type {
        CellType::Header => 220,
        CellType::Footer => scaled(40.0, tuning.footer_penalty),
        CellType::Table => 160,
        // Plain text and any other cell kind share the neutral base.
        _ => 100,
    };
    let heading_bonus = if is_all_caps(line) {
        scaled(35.0, tuning.heading_boost)
    } else {
        0
    };
    let number_bonus = if contains_numbers(line) {
        scaled(20.0, tuning.number_boost)
    } else {
        0
    };
    let early_bonus = if line_index < 5 {
        scaled(15.0, tuning.early_line_bonus)
    } else {
        0
    };
    // Lossless cast: a string's byte length never exceeds `isize::MAX`.
    let length_penalty = (line.len() / 120) as i64 * -10;
    let score = base + heading_bonus + number_bonus + early_bonus + length_penalty;
    score.clamp(0, 255) as u8
}

/// Heuristically decides whether a line belongs to a table.
///
/// A line qualifies if it contains a pipe or a tab, or if its lowercased text has one of the
/// words `total`, `subtotal` or `amount` followed later by `usd`, `eur` or a percent sign.
///
/// The percent sign is matched without word-boundary assertions: `%` is not a word
/// character, so wrapping it in `\b…\b` would only match when it is glued between two word
/// characters and would miss ordinary text such as `total 15%`.
fn looks_like_table(line: &str) -> bool {
    static TABLE_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"\b(total|subtotal|amount)\b.*(?:\b(?:usd|eur)\b|%)")
            .expect("TABLE_RE is a valid hard-coded pattern")
    });
    line.contains('|') || line.contains('\t') || TABLE_RE.is_match(&line.to_lowercase())
}

/// Table detection that also recognizes columns laid out with runs of spaces.
///
/// Returns `true` when `looks_like_table` accepts the line. Otherwise the line must have at
/// least three whitespace-separated tokens and a run of consecutive spaces at least
/// `max(tolerance_px / 8, 2)` characters long.
pub fn looks_like_table_with_tolerance(line: &str, tolerance_px: u32) -> bool {
    if looks_like_table(line) {
        return true;
    }
    // The third token exists exactly when there are three or more tokens.
    let has_three_tokens = line.split_whitespace().nth(2).is_some();
    let min_gap = std::cmp::max(tolerance_px / 8, 2) as usize;
    has_three_tokens && longest_space_run(line) >= min_gap
}

/// A header is a line with more than three alphabetic characters, all of them uppercase.
fn looks_like_header(line: &str) -> bool {
    if !is_all_caps(line) {
        return false;
    }
    let letter_count = line.chars().filter(|ch| ch.is_alphabetic()).count();
    letter_count > 3
}

/// A footer is a line whose lowercased text contains `"page "` or `"confidential"`.
fn looks_like_footer(line: &str) -> bool {
    const MARKERS: [&str; 2] = ["page ", "confidential"];
    let lowered = line.to_lowercase();
    MARKERS.iter().any(|marker| lowered.contains(*marker))
}

/// Reports whether the line contains at least one ASCII digit (`0`–`9`).
fn contains_numbers(line: &str) -> bool {
    // In UTF-8 the bytes 0x30..=0x39 only ever encode the ASCII digits themselves, so a
    // byte scan gives the same answer as a char scan.
    line.bytes().any(|byte| byte.is_ascii_digit())
}

/// Reports whether the line has at least one alphabetic character and every alphabetic
/// character in it is uppercase. Non-alphabetic characters are ignored.
fn is_all_caps(line: &str) -> bool {
    let mut letters = line.chars().filter(|ch| ch.is_alphabetic()).peekable();
    letters.peek().is_some() && letters.all(char::is_uppercase)
}

/// Returns the length of the longest run of consecutive ASCII spaces in the line, or 0 if
/// it has none. Other whitespace such as tabs does not count.
fn longest_space_run(line: &str) -> usize {
    // Splitting on every non-space character leaves only the runs of spaces (and empty
    // pieces); a space is one byte, so each piece's byte length is its space count.
    line.split(|ch: char| ch != ' ')
        .map(str::len)
        .max()
        .unwrap_or(0)
}

// Unit tests for the normalization and table-detection heuristics above.
#[cfg(test)]
mod tests {
    use super::*;

    // Outer whitespace is stripped and inner runs of spaces collapse to one space.
    #[test]
    fn normalizes_whitespace() {
        let line = "  H e l l o   —  WORLD  ";
        assert_eq!(normalize_line(line), "H e l l o — WORLD");
    }

    // A pipe marks a table, and so does a total/amount keyword followed by a currency code.
    // The second line is all caps, but the table check runs before the header check.
    #[test]
    fn detects_tables() {
        assert_eq!(classify_cell_type("| Col |"), CellType::Table);
        assert_eq!(classify_cell_type("TOTAL AMOUNT USD"), CellType::Table);
    }

    // 24 px gives a 3-character threshold, which the 6-space gaps exceed; the second line
    // has fewer than three tokens and is rejected regardless of spacing.
    #[test]
    fn tolerance_detects_layout_tables() {
        assert!(looks_like_table_with_tolerance("Q1      Q2      Q3", 24));
        assert!(!looks_like_table_with_tolerance("Short line", 32));
    }
}