three_dcf_core/
normalization.rs1use once_cell::sync::Lazy;
2use regex::Regex;
3use unicode_normalization::UnicodeNormalization;
4
5use crate::document::CellType;
6
/// Multipliers applied to the individual terms of [`importance_score`].
///
/// Each field scales one fixed contribution of the score; a value of `1.0`
/// leaves that contribution at its built-in weight.
#[derive(Debug, Clone, Copy)]
pub struct ImportanceTuning {
    /// Scales the bonus awarded to lines whose letters are all uppercase.
    pub heading_boost: f32,
    /// Scales the bonus awarded to lines that contain an ASCII digit.
    pub number_boost: f32,
    /// Scales the base score assigned to footer cells.
    pub footer_penalty: f32,
    /// Scales the bonus awarded to lines whose index is below five.
    pub early_line_bonus: f32,
}

impl Default for ImportanceTuning {
    /// Builds the stock tuning: every bonus at full weight and the footer
    /// base score halved.
    fn default() -> Self {
        ImportanceTuning {
            footer_penalty: 0.5,
            heading_boost: 1.0,
            number_boost: 1.0,
            early_line_bonus: 1.0,
        }
    }
}
25
/// Selects how [`normalize_lines`] treats lines that end with a hyphen.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HyphenationMode {
    /// Join a line that ends in `-` with the line that follows it.
    Merge,
    /// Keep every input line as its own entry.
    Preserve,
}
31
32pub fn normalize_line(line: &str) -> String {
33 let trimmed = line.trim_matches(|c: char| c.is_control() || c.is_whitespace());
34 let nfkc = trimmed.nfkc().collect::<String>();
35 let mut result = String::with_capacity(nfkc.len());
36 let mut prev_space = false;
37 for ch in nfkc.chars() {
38 if ch.is_control() {
39 continue;
40 }
41 if ch.is_whitespace() {
42 if !prev_space {
43 result.push(' ');
44 prev_space = true;
45 }
46 } else {
47 result.push(ch);
48 prev_space = false;
49 }
50 }
51 result.trim().to_string()
52}
53
54pub fn normalize_lines(lines: &[String], mode: HyphenationMode) -> Vec<String> {
55 let mut merged = match mode {
56 HyphenationMode::Merge => merge_hyphenation(lines),
57 HyphenationMode::Preserve => lines.to_vec(),
58 };
59 merged
60 .drain(..)
61 .map(|line| normalize_line(&line))
62 .filter(|line| !line.is_empty())
63 .collect()
64}
65
/// Joins words that were split across lines with a trailing hyphen.
///
/// A line is treated as hyphenated only when, ignoring trailing whitespace,
/// it ends in a single `-` that directly follows a letter. That hyphen is
/// removed and the next line (minus its leading whitespace) is appended.
/// Lines such as `---`, `--`, `-` or `pages 10-` are passed through
/// untouched, so rules and numeric ranges are neither dropped nor glued to
/// the following line.
///
/// If the final line is hyphenated there is nothing left to join it with;
/// it is emitted with the hyphen removed.
fn merge_hyphenation(lines: &[String]) -> Vec<String> {
    let mut out = Vec::with_capacity(lines.len());
    let mut carry = String::new();
    for line in lines {
        let current = if carry.is_empty() {
            line.clone()
        } else {
            // Take the pending stem out of `carry` (leaving it empty)
            // instead of cloning it.
            let mut combined = std::mem::take(&mut carry);
            combined.push_str(line.trim_start());
            combined
        };
        if let Some(stem) = hyphenated_stem(&current) {
            carry = stem.to_owned();
            continue;
        }
        out.push(current);
    }
    if !carry.is_empty() {
        out.push(carry);
    }
    out
}

/// Returns the text before a word-breaking trailing hyphen, or `None` when
/// `line` does not end in a letter followed by exactly one `-`.
fn hyphenated_stem(line: &str) -> Option<&str> {
    let stem = line.trim_end().strip_suffix('-')?;
    let last = stem.chars().next_back()?;
    if last.is_alphabetic() {
        Some(stem)
    } else {
        None
    }
}
90
91pub fn classify_cell_type(line: &str) -> CellType {
92 if looks_like_table(line) {
93 CellType::Table
94 } else if looks_like_header(line) {
95 CellType::Header
96 } else if looks_like_footer(line) {
97 CellType::Footer
98 } else {
99 CellType::Text
100 }
101}
102
103pub fn importance_score(
104 line: &str,
105 cell_type: CellType,
106 line_index: usize,
107 tuning: &ImportanceTuning,
108) -> u8 {
109 let base = match cell_type {
110 CellType::Header => 220,
111 CellType::Footer => (40.0 * tuning.footer_penalty) as i32,
112 CellType::Table => 160,
113 _ => 100,
114 };
115 let heading_bonus = if is_all_caps(line) {
116 (35.0 * tuning.heading_boost) as i32
117 } else {
118 0
119 };
120 let number_bonus = if contains_numbers(line) {
121 (20.0 * tuning.number_boost) as i32
122 } else {
123 0
124 };
125 let early_bonus = if line_index < 5 {
126 (15.0 * tuning.early_line_bonus) as i32
127 } else {
128 0
129 };
130 let length_penalty = (line.len() / 120) as i32 * -10;
131 let score = base + heading_bonus + number_bonus + early_bonus + length_penalty;
132 score.clamp(0, 255) as u8
133}
134
135fn looks_like_table(line: &str) -> bool {
136 static TABLE_RE: Lazy<Regex> =
137 Lazy::new(|| Regex::new(r"\b(total|subtotal|amount)\b.*\b(usd|eur|%)\b").unwrap());
138 line.contains('|') || line.contains('\t') || TABLE_RE.is_match(&line.to_lowercase())
139}
140
141pub fn looks_like_table_with_tolerance(line: &str, tolerance_px: u32) -> bool {
142 if looks_like_table(line) {
143 return true;
144 }
145 let tokens = line.split_whitespace().collect::<Vec<_>>();
146 if tokens.len() < 3 {
147 return false;
148 }
149 let tolerance_chars = ((tolerance_px / 8).max(2)) as usize;
150 longest_space_run(line) >= tolerance_chars
151}
152
153fn looks_like_header(line: &str) -> bool {
154 line.chars().filter(|c| c.is_alphabetic()).count() > 3 && is_all_caps(line)
155}
156
/// Returns `true` when the lowercased line contains `"page "` (with the
/// trailing space) or `"confidential"`.
fn looks_like_footer(line: &str) -> bool {
    const MARKERS: [&str; 2] = ["page ", "confidential"];
    let lowered = line.to_lowercase();
    MARKERS.iter().any(|marker| lowered.contains(*marker))
}
161
/// Returns `true` when the line contains at least one ASCII digit (`0`-`9`).
fn contains_numbers(line: &str) -> bool {
    // In UTF-8 the bytes b'0'..=b'9' only ever encode the ASCII digits
    // themselves, so scanning bytes is equivalent to scanning chars.
    line.bytes().any(|byte| byte.is_ascii_digit())
}
165
/// Returns `true` when the line has at least one alphabetic character and
/// every alphabetic character in it is uppercase.
///
/// Non-alphabetic characters are ignored. Alphabetic characters that are
/// not uppercase, including letters from scripts without case, make the
/// result `false`.
fn is_all_caps(line: &str) -> bool {
    let mut letters = line.chars().filter(|ch| ch.is_alphabetic()).peekable();
    letters.peek().is_some() && letters.all(char::is_uppercase)
}
173
/// Returns the length of the longest run of consecutive ASCII spaces in
/// the line, or 0 when it contains none. Tabs and other whitespace are not
/// counted.
fn longest_space_run(line: &str) -> usize {
    // Splitting on every non-space character leaves only the (possibly
    // empty) runs of spaces; each space is one byte, so `len` is the count.
    line.split(|ch: char| ch != ' ')
        .map(str::len)
        .max()
        .unwrap_or(0)
}
187
#[cfg(test)]
mod tests {
    use super::*;

    /// Outer padding is stripped and inner runs of spaces collapse to one.
    #[test]
    fn normalizes_whitespace() {
        let line = "  H e l l o   —   WORLD  ";
        assert_eq!(normalize_line(line), "H e l l o — WORLD");
    }

    /// Pipe-delimited rows and total/currency lines classify as tables.
    #[test]
    fn detects_tables() {
        assert_eq!(classify_cell_type("| Col |"), CellType::Table);
        assert_eq!(classify_cell_type("TOTAL AMOUNT USD"), CellType::Table);
    }

    /// Wide gaps between three or more tokens count as a layout table.
    #[test]
    fn tolerance_detects_layout_tables() {
        // 24 px / 8 = 3, so the line needs a run of at least three spaces;
        // the columns below are separated by four.
        assert!(looks_like_table_with_tolerance("Q1    Q2    Q3", 24));
        // Only two tokens: rejected before the gap is even measured.
        assert!(!looks_like_table_with_tolerance("Short line", 32));
    }
}
209}