Skip to main content

three_dcf_core/
normalization.rs

1use once_cell::sync::Lazy;
2use regex::Regex;
3use unicode_normalization::UnicodeNormalization;
4
5use crate::document::CellType;
6
/// Multipliers for the bonuses and penalties summed by [`importance_score`].
///
/// Every field scales one fixed point value inside the scorer; a value of
/// `1.0` leaves that contribution unchanged.
#[derive(Clone, Copy, Debug)]
pub struct ImportanceTuning {
    /// Scales the bonus for lines whose letters are all uppercase.
    pub heading_boost: f32,
    /// Scales the bonus for lines containing an ASCII digit.
    pub number_boost: f32,
    /// Scales the base score assigned to footer cells.
    pub footer_penalty: f32,
    /// Scales the bonus for the first five lines (`line_index < 5`).
    pub early_line_bonus: f32,
}
14
15impl Default for ImportanceTuning {
16    fn default() -> Self {
17        Self {
18            heading_boost: 1.0,
19            number_boost: 1.0,
20            footer_penalty: 0.5,
21            early_line_bonus: 1.0,
22        }
23    }
24}
25
/// How [`normalize_lines`] treats lines that end in a hyphen.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum HyphenationMode {
    /// Join a line ending in `-` with the line that follows it before
    /// normalizing.
    Merge,
    /// Normalize each input line on its own, without joining any lines.
    Preserve,
}
31
32pub fn normalize_line(line: &str) -> String {
33    let trimmed = line.trim_matches(|c: char| c.is_control() || c.is_whitespace());
34    let nfkc = trimmed.nfkc().collect::<String>();
35    let mut result = String::with_capacity(nfkc.len());
36    let mut prev_space = false;
37    for ch in nfkc.chars() {
38        if ch.is_control() {
39            continue;
40        }
41        if ch.is_whitespace() {
42            if !prev_space {
43                result.push(' ');
44                prev_space = true;
45            }
46        } else {
47            result.push(ch);
48            prev_space = false;
49        }
50    }
51    result.trim().to_string()
52}
53
54pub fn normalize_lines(lines: &[String], mode: HyphenationMode) -> Vec<String> {
55    let mut merged = match mode {
56        HyphenationMode::Merge => merge_hyphenation(lines),
57        HyphenationMode::Preserve => lines.to_vec(),
58    };
59    merged
60        .drain(..)
61        .map(|line| normalize_line(&line))
62        .filter(|line| !line.is_empty())
63        .collect()
64}
65
/// Joins words that were split across lines with a trailing hyphen.
///
/// A line counts as hyphenated only when, after trimming trailing whitespace,
/// it ends in a single `-` that directly follows a letter. The hyphen is
/// dropped and the next line (minus its leading whitespace) is appended.
/// Lines such as `---`, `-`, `word--` or `Total -` are passed through
/// untouched, so horizontal rules and dangling dashes are never swallowed.
///
/// If the final line is hyphenated, its stem is emitted without the hyphen.
fn merge_hyphenation(lines: &[String]) -> Vec<String> {
    let mut out = Vec::with_capacity(lines.len());
    let mut carry = String::new();
    for line in lines {
        let current = if carry.is_empty() {
            line.clone()
        } else {
            // Reuse the carried stem's buffer instead of cloning it.
            let mut combined = std::mem::take(&mut carry);
            combined.push_str(line.trim_start());
            combined
        };
        if let Some(stem) = hyphenated_stem(&current) {
            carry = stem.to_owned();
            continue;
        }
        out.push(current);
    }
    if !carry.is_empty() {
        out.push(carry);
    }
    out
}

/// Returns the text before a trailing word-break hyphen, or `None` when the
/// line does not end in a letter followed by exactly one `-`.
fn hyphenated_stem(line: &str) -> Option<&str> {
    let stem = line.trim_end().strip_suffix('-')?;
    if stem.ends_with(char::is_alphabetic) {
        Some(stem)
    } else {
        None
    }
}
90
91pub fn classify_cell_type(line: &str) -> CellType {
92    if looks_like_table(line) {
93        CellType::Table
94    } else if looks_like_header(line) {
95        CellType::Header
96    } else if looks_like_footer(line) {
97        CellType::Footer
98    } else {
99        CellType::Text
100    }
101}
102
103pub fn importance_score(
104    line: &str,
105    cell_type: CellType,
106    line_index: usize,
107    tuning: &ImportanceTuning,
108) -> u8 {
109    let base = match cell_type {
110        CellType::Header => 220,
111        CellType::Footer => (40.0 * tuning.footer_penalty) as i32,
112        CellType::Table => 160,
113        _ => 100,
114    };
115    let heading_bonus = if is_all_caps(line) {
116        (35.0 * tuning.heading_boost) as i32
117    } else {
118        0
119    };
120    let number_bonus = if contains_numbers(line) {
121        (20.0 * tuning.number_boost) as i32
122    } else {
123        0
124    };
125    let early_bonus = if line_index < 5 {
126        (15.0 * tuning.early_line_bonus) as i32
127    } else {
128        0
129    };
130    let length_penalty = (line.len() / 120) as i32 * -10;
131    let score = base + heading_bonus + number_bonus + early_bonus + length_penalty;
132    score.clamp(0, 255) as u8
133}
134
135fn looks_like_table(line: &str) -> bool {
136    static TABLE_RE: Lazy<Regex> =
137        Lazy::new(|| Regex::new(r"\b(total|subtotal|amount)\b.*\b(usd|eur|%)\b").unwrap());
138    line.contains('|') || line.contains('\t') || TABLE_RE.is_match(&line.to_lowercase())
139}
140
141pub fn looks_like_table_with_tolerance(line: &str, tolerance_px: u32) -> bool {
142    if looks_like_table(line) {
143        return true;
144    }
145    let tokens = line.split_whitespace().collect::<Vec<_>>();
146    if tokens.len() < 3 {
147        return false;
148    }
149    let tolerance_chars = ((tolerance_px / 8).max(2)) as usize;
150    longest_space_run(line) >= tolerance_chars
151}
152
153fn looks_like_header(line: &str) -> bool {
154    line.chars().filter(|c| c.is_alphabetic()).count() > 3 && is_all_caps(line)
155}
156
/// Returns `true` when the lowercased line contains `"page "` or
/// `"confidential"` anywhere in it.
fn looks_like_footer(line: &str) -> bool {
    let lowered = line.to_lowercase();
    ["page ", "confidential"]
        .iter()
        .any(|&marker| lowered.contains(marker))
}
161
/// Returns `true` when the line contains at least one ASCII digit (`0`-`9`).
fn contains_numbers(line: &str) -> bool {
    // In UTF-8 the bytes `0`..=`9` only ever encode the ASCII digits
    // themselves, so a byte scan is equivalent to a char scan.
    line.bytes().any(|byte| byte.is_ascii_digit())
}
165
/// Returns `true` when the line has at least one alphabetic character and
/// every alphabetic character is uppercase. Non-letters are ignored.
fn is_all_caps(line: &str) -> bool {
    let mut letters = line.chars().filter(|ch| ch.is_alphabetic()).peekable();
    // A line without any letters is never considered "all caps".
    letters.peek().is_some() && letters.all(char::is_uppercase)
}
173
/// Returns the length of the longest run of consecutive `' '` characters in
/// the line, or 0 when it contains none. Other whitespace does not count.
fn longest_space_run(line: &str) -> usize {
    // Splitting on every non-space character leaves only the runs of spaces;
    // a space is one byte, so each piece's byte length is its run length.
    line.split(|ch: char| ch != ' ')
        .map(str::len)
        .max()
        .unwrap_or(0)
}
187
#[cfg(test)]
mod tests {
    use super::*;

    /// Outer whitespace is stripped and inner runs collapse to one space.
    #[test]
    fn normalizes_whitespace() {
        let normalized = normalize_line("  H e l l o   —  WORLD  ");
        assert_eq!(normalized, "H e l l o — WORLD");
    }

    /// Both the pipe rule and the keyword rule classify a line as a table.
    #[test]
    fn detects_tables() {
        let table_lines = ["| Col |", "TOTAL AMOUNT USD"];
        for &line in table_lines.iter() {
            assert_eq!(classify_cell_type(line), CellType::Table, "line: {:?}", line);
        }
    }

    /// Wide gaps between three or more tokens count as a table; short lines do not.
    #[test]
    fn tolerance_detects_layout_tables() {
        let wide_gaps = looks_like_table_with_tolerance("Q1      Q2      Q3", 24);
        let short_line = looks_like_table_with_tolerance("Short line", 32);
        assert!(wide_gaps);
        assert!(!short_line);
    }
}